Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00

Update GitHub pages in root to v0.21.0rc0

parent 7fd841d6de
commit 2caf860fe7
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 12c1352bd1428d2c6ac709024163b9d8
+config: 5c850ce0a6f2d0ce79a91d25fbeeb241
 tags: 645f666f9bcd5a90fca523b33c5a78b7

File diff suppressed because it is too large
@@ -51,7 +51,7 @@
 <script>
 DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
 DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
 DOCUMENTATION_OPTIONS.show_version_warning_banner =
 false;
 </script>
@@ -63,7 +63,7 @@

 <meta name="viewport" content="width=device-width, initial-scale=1"/>
 <meta name="docsearch:language" content="en"/>
-<meta name="docsearch:version" content="0.20.0rc3" />
+<meta name="docsearch:version" content="0.21.0rc0" />


 </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -2190,6 +2194,18 @@
 <span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig16countLocalLayersE9LayerType10SizeType3210SizeType32" title="Link to this definition">#</a><br /></dt>
 <dd></dd></dl>

+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getFirstLocalLayerE10SizeType3210SizeType32">
+<span id="_CPPv3NK12tensorrt_llm7runtime11ModelConfig18getFirstLocalLayerE10SizeType3210SizeType32"></span><span id="_CPPv2NK12tensorrt_llm7runtime11ModelConfig18getFirstLocalLayerE10SizeType3210SizeType32"></span><span id="tensorrt_llm::runtime::ModelConfig::getFirstLocalLayer__SizeType32.SizeType32C"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1ModelConfig_1a2c819f0d4717a6ad56c2f701f0ff1698"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getFirstLocalLayer</span></span></span><span class="sig-paren">(</span>
+<dl>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">pipelineParallelism</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">1</span></span></em>,</dd>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">pipelineParallelismRank</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span></em>,</dd>
+</dl>
+<span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getFirstLocalLayerE10SizeType3210SizeType32" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
 <dl class="cpp function">
 <dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm7runtime11ModelConfig20countLowerRankLayersE9LayerType10SizeType3210SizeType32">
 <span id="_CPPv3NK12tensorrt_llm7runtime11ModelConfig20countLowerRankLayersE9LayerType10SizeType3210SizeType32"></span><span id="_CPPv2NK12tensorrt_llm7runtime11ModelConfig20countLowerRankLayersE9LayerType10SizeType3210SizeType32"></span><span id="tensorrt_llm::runtime::ModelConfig::countLowerRankLayers__LayerType.SizeType32.SizeType32C"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1ModelConfig_1a4c9cabd1675a0db58bce743a0ac0470e"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">countLowerRankLayers</span></span></span><span class="sig-paren">(</span>
@@ -2204,8 +2220,15 @@
 <dd></dd></dl>

 <dl class="cpp function">
-<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType32">
-<span id="_CPPv3NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType32"></span><span id="_CPPv2NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType32"></span><span id="tensorrt_llm::runtime::ModelConfig::getNbLayers__SizeType32C"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1ModelConfig_1aefd69a08c1409f90a4e948d857cc08b1"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getNbLayers</span></span></span><span class="sig-paren">(</span><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">pipelineParallelism</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">1</span></span></em><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType32" title="Link to this definition">#</a><br /></dt>
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType3210SizeType32">
+<span id="_CPPv3NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType3210SizeType32"></span><span id="_CPPv2NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType3210SizeType32"></span><span id="tensorrt_llm::runtime::ModelConfig::getNbLayers__SizeType32.SizeType32C"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1ModelConfig_1aba756e17c1d83a61adf10f12a3787479"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getNbLayers</span></span></span><span class="sig-paren">(</span>
+<dl>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">pipelineParallelism</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">1</span></span></em>,</dd>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">pipelineParallelismRank</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span></em>,</dd>
+</dl>
+<span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType3210SizeType32" title="Link to this definition">#</a><br /></dt>
 <dd></dd></dl>

 <dl class="cpp function">
@@ -11199,6 +11222,19 @@ one more than decoding draft tokens for prediction from primary head </p>
 </dl>
 </dd></dl>

+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsE10SizeType32">
+<span id="_CPPv3NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsE10SizeType32"></span><span id="_CPPv2NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsE10SizeType32"></span><span id="tensorrt_llm::runtime::decoder::DecoderState::getSequenceLengths__SizeType32C"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1decoder_1_1DecoderState_1ad9521ae6439b0704412f786c854c9145"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState9TensorPtrE" title="tensorrt_llm::runtime::decoder::DecoderState::TensorPtr"><span class="n"><span class="pre">TensorPtr</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getSequenceLengths</span></span></span><span class="sig-paren">(</span><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">batchIdx</span></span></em><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsE10SizeType32" title="Link to this definition">#</a><br /></dt>
+<dd><dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>batchIdx</strong> – index of the batch </p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>[maxBeamWidth], sequence lengths for request <code class="docutils literal notranslate"><span class="pre">batchIdx</span></code>, on gpu </p>
+</dd>
+</dl>
+</dd></dl>
+
 <dl class="cpp function">
 <dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getAllNewTokensEv">
 <span id="_CPPv3NK12tensorrt_llm7runtime7decoder12DecoderState15getAllNewTokensEv"></span><span id="_CPPv2NK12tensorrt_llm7runtime7decoder12DecoderState15getAllNewTokensEv"></span><span id="tensorrt_llm::runtime::decoder::DecoderState::getAllNewTokensC"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1decoder_1_1DecoderState_1a1313811f8c18a59d45a542374ee5f6df"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState9TensorPtrE" title="tensorrt_llm::runtime::decoder::DecoderState::TensorPtr"><span class="n"><span class="pre">TensorPtr</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getAllNewTokens</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getAllNewTokensEv" title="Link to this definition">#</a><br /></dt>
@@ -11270,6 +11306,11 @@ one more than decoding draft tokens for prediction from primary head </p>
 </dl>
 </dd></dl>

+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBatchSizeEv">
+<span id="_CPPv3NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBatchSizeEv"></span><span id="_CPPv2NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBatchSizeEv"></span><span id="tensorrt_llm::runtime::decoder::DecoderState::getMaxBatchSizeC"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1decoder_1_1DecoderState_1afa651d891bae6694a10aa7288c3724d9"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getMaxBatchSize</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBatchSizeEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
 <dl class="cpp function">
 <dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBeamWidthEv">
 <span id="_CPPv3NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBeamWidthEv"></span><span id="_CPPv2NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBeamWidthEv"></span><span id="tensorrt_llm::runtime::decoder::DecoderState::getMaxBeamWidthC"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1decoder_1_1DecoderState_1affb5c3e06a18f4e511a8f2662ed59013"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getMaxBeamWidth</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBeamWidthEv" title="Link to this definition">#</a><br /></dt>
@@ -11500,6 +11541,11 @@ one more than decoding draft tokens for prediction from primary head </p>
 <span id="_CPPv3N12tensorrt_llm7runtime16AllReduceBuffers18mAllReduceCommPtrsE"></span><span id="_CPPv2N12tensorrt_llm7runtime16AllReduceBuffers18mAllReduceCommPtrsE"></span><span id="tensorrt_llm::runtime::AllReduceBuffers::mAllReduceCommPtrs__TensorPtr"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1AllReduceBuffers_1ab48e63279d11f42d71c3621820d2520c"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9TensorPtrE" title="tensorrt_llm::runtime::AllReduceBuffers::TensorPtr"><span class="n"><span class="pre">TensorPtr</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mAllReduceCommPtrs</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers18mAllReduceCommPtrsE" title="Link to this definition">#</a><br /></dt>
 <dd></dd></dl>

+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9mFlagPtrsE">
+<span id="_CPPv3N12tensorrt_llm7runtime16AllReduceBuffers9mFlagPtrsE"></span><span id="_CPPv2N12tensorrt_llm7runtime16AllReduceBuffers9mFlagPtrsE"></span><span id="tensorrt_llm::runtime::AllReduceBuffers::mFlagPtrs__TensorPtr"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1AllReduceBuffers_1a304f00427fcda4b28d5b235fef1a544c"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9TensorPtrE" title="tensorrt_llm::runtime::AllReduceBuffers::TensorPtr"><span class="n"><span class="pre">TensorPtr</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mFlagPtrs</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9mFlagPtrsE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
 <dl class="cpp var">
 <dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers17mIpcMemoryHandlesE">
 <span id="_CPPv3N12tensorrt_llm7runtime16AllReduceBuffers17mIpcMemoryHandlesE"></span><span id="_CPPv2N12tensorrt_llm7runtime16AllReduceBuffers17mIpcMemoryHandlesE"></span><span id="tensorrt_llm::runtime::AllReduceBuffers::mIpcMemoryHandles__std::vector:runtime::IpcMemory:"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1AllReduceBuffers_1a162c983f7dc981a8c4af57510637e767"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">vector</span></span><span class="p"><span class="pre"><</span></span><a class="reference internal" href="executor.html#_CPPv4N12tensorrt_llm7runtimeE" title="tensorrt_llm::runtime"><span class="n"><span class="pre">runtime</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime9IpcMemoryE" title="tensorrt_llm::runtime::IpcMemory"><span class="n"><span class="pre">IpcMemory</span></span></a><span class="p"><span class="pre">></span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mIpcMemoryHandles</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers17mIpcMemoryHandlesE" title="Link to this definition">#</a><br /></dt>
@@ -12171,8 +12217,9 @@ one more than decoding draft tokens for prediction from primary head </p>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getVocabSizeEv"><code class="docutils literal notranslate"><span class="pre">getVocabSize()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getVocabSizePaddedE10SizeType32"><code class="docutils literal notranslate"><span class="pre">getVocabSizePadded()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig16countLocalLayersE9LayerType10SizeType3210SizeType32"><code class="docutils literal notranslate"><span class="pre">countLocalLayers()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getFirstLocalLayerE10SizeType3210SizeType32"><code class="docutils literal notranslate"><span class="pre">getFirstLocalLayer()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig20countLowerRankLayersE9LayerType10SizeType3210SizeType32"><code class="docutils literal notranslate"><span class="pre">countLowerRankLayers()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType32"><code class="docutils literal notranslate"><span class="pre">getNbLayers()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType3210SizeType32"><code class="docutils literal notranslate"><span class="pre">getNbLayers()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getNbAttentionLayersE10SizeType3210SizeType32"><code class="docutils literal notranslate"><span class="pre">getNbAttentionLayers()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getNbRnnLayersE10SizeType3210SizeType32"><code class="docutils literal notranslate"><span class="pre">getNbRnnLayers()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig10getNbHeadsEv"><code class="docutils literal notranslate"><span class="pre">getNbHeads()</span></code></a></li>
@@ -13526,6 +13573,7 @@ one more than decoding draft tokens for prediction from primary head </p>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState11getLogProbsEv"><code class="docutils literal notranslate"><span class="pre">getLogProbs()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState11getLogProbsE10SizeType32"><code class="docutils literal notranslate"><span class="pre">getLogProbs()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsEv"><code class="docutils literal notranslate"><span class="pre">getSequenceLengths()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsE10SizeType32"><code class="docutils literal notranslate"><span class="pre">getSequenceLengths()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getAllNewTokensEv"><code class="docutils literal notranslate"><span class="pre">getAllNewTokens()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getNextDraftTokensEv"><code class="docutils literal notranslate"><span class="pre">getNextDraftTokens()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState25getPrevDraftTokensLengthsEv"><code class="docutils literal notranslate"><span class="pre">getPrevDraftTokensLengths()</span></code></a></li>
@@ -13533,6 +13581,7 @@ one more than decoding draft tokens for prediction from primary head </p>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState24getAcceptedLengthsCumSumEv"><code class="docutils literal notranslate"><span class="pre">getAcceptedLengthsCumSum()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState22getAcceptedPackedPathsEv"><code class="docutils literal notranslate"><span class="pre">getAcceptedPackedPaths()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState16getFinishedStepsEv"><code class="docutils literal notranslate"><span class="pre">getFinishedSteps()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBatchSizeEv"><code class="docutils literal notranslate"><span class="pre">getMaxBatchSize()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBeamWidthEv"><code class="docutils literal notranslate"><span class="pre">getMaxBeamWidth()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState20getMaxSequenceLengthEv"><code class="docutils literal notranslate"><span class="pre">getMaxSequenceLength()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState27getMaxDecodingDecoderTokensEv"><code class="docutils literal notranslate"><span class="pre">getMaxDecodingDecoderTokens()</span></code></a></li>
@@ -13566,6 +13615,7 @@ one more than decoding draft tokens for prediction from primary head </p>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9TensorPtrE"><code class="docutils literal notranslate"><span class="pre">TensorPtr</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb"><code class="docutils literal notranslate"><span class="pre">AllReduceBuffers()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers18mAllReduceCommPtrsE"><code class="docutils literal notranslate"><span class="pre">mAllReduceCommPtrs</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9mFlagPtrsE"><code class="docutils literal notranslate"><span class="pre">mFlagPtrs</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers17mIpcMemoryHandlesE"><code class="docutils literal notranslate"><span class="pre">mIpcMemoryHandles</span></code></a></li>
 </ul>
 </li>
@@ -13717,6 +13767,15 @@ one more than decoding draft tokens for prediction from primary head </p>
 </p>
 </div>

+<div class="footer-item">
+<div class="extra_footer">
+
+<p>Last updated on June 03, 2025.</p>
+
+<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+
+</div></div>
+
 </div>
_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py (1081 lines, normal file)
File diff suppressed because it is too large
@@ -1,14 +1,17 @@
 import json
 import math
 import os
 from abc import ABC, abstractmethod
-from dataclasses import dataclass, field, fields
+from dataclasses import dataclass, field
 from enum import Enum, EnumMeta
 from pathlib import Path
-from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
+from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Literal, Optional,
+                    Union)

 import torch
 import yaml
-from pydantic import BaseModel, Field, validator
+from pydantic import (BaseModel, Field, PrivateAttr, field_validator,
+                      model_validator)
 from strenum import StrEnum
 from transformers import PreTrainedTokenizerBase

@@ -17,23 +20,30 @@ from tensorrt_llm.lora_manager import (LoraConfig,

 from .._utils import mpi_rank
 from ..auto_parallel import AutoParallelConfig, infer_cluster_config

+if TYPE_CHECKING:
+    from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
+
 # yapf: disable
-from ..bindings.executor import BatchingType as _BatchingType
-from ..bindings.executor import \
-    CacheTransceiverConfig as _CacheTransceiverConfig
-from ..bindings.executor import \
-    CapacitySchedulerPolicy as _CapacitySchedulerPolicy
-from ..bindings.executor import ContextChunkingPolicy as _ContextChunkingPolicy
-from ..bindings.executor import DecodingConfig, DecodingMode
-from ..bindings.executor import DynamicBatchConfig as _DynamicBatchConfig
-from ..bindings.executor import EagleConfig, ExecutorConfig
-from ..bindings.executor import \
-    ExtendedRuntimePerfKnobConfig as _ExtendedRuntimePerfKnobConfig
-from ..bindings.executor import KvCacheConfig as _KvCacheConfig
-from ..bindings.executor import \
-    LookaheadDecodingConfig as _LookaheadDecodingConfig
-from ..bindings.executor import PeftCacheConfig as _PeftCacheConfig
-from ..bindings.executor import SchedulerConfig as _SchedulerConfig
+# isort: off
+from ..bindings.executor import (
+    BatchingType as _BatchingType,
+    CacheTransceiverConfig as _CacheTransceiverConfig,
+    CapacitySchedulerPolicy as _CapacitySchedulerPolicy,
+    ContextChunkingPolicy as _ContextChunkingPolicy,
+    DecodingConfig,
+    DecodingMode,
+    DynamicBatchConfig as _DynamicBatchConfig,
+    EagleConfig as _EagleConfig,
+    ExecutorConfig as _ExecutorConfig,
+    ExtendedRuntimePerfKnobConfig as _ExtendedRuntimePerfKnobConfig,
+    KvCacheConfig as _KvCacheConfig,
+    LookaheadDecodingConfig as _LookaheadDecodingConfig,
+    PeftCacheConfig as _PeftCacheConfig,
+    SchedulerConfig as _SchedulerConfig)  # isort: skip
+# isort: on
 from transformers import PreTrainedTokenizerBase

 # yapf: enable
 from ..builder import BuildConfig, EngineConfig
 from ..logger import logger
@@ -195,7 +205,8 @@ class DecodingBaseConfig(BaseModel):
             "MTP": MTPDecodingConfig,
             "Medusa": MedusaDecodingConfig,
             "Eagle": EagleDecodingConfig,
-            "Lookahead": LookaheadDecodingConfig
+            "Lookahead": LookaheadDecodingConfig,
+            "NGram": NGramDecodingConfig,
         }

         config_class = config_classes.get(decoding_type)
@@ -228,6 +239,7 @@ class EagleDecodingConfig(DecodingBaseConfig):
     num_eagle_layers: Optional[int] = None
     max_non_leaves_per_layer: Optional[int] = None
     pytorch_eagle_weights_path: Optional[str] = None
+    eagle3_one_model: Optional[bool] = True

     @classmethod
     def from_dict(cls, data: dict):
@@ -236,6 +248,40 @@ class EagleDecodingConfig(DecodingBaseConfig):
     decoding_type: ClassVar[str] = "Eagle"


+class NGramDecodingConfig(DecodingBaseConfig):
+    """
+    Configuration for NGram drafter speculative decoding.
+
+    Arguments:
+        prompt_lookup_num_tokens: int
+            The maximum length of the draft tokens (i.e., the maximum number of output draft tokens).
+
+        max_matching_ngram_size: int
+            The maximum length of the search pattern (i.e., the maximum number of input tokens to match).
+
+        is_keep_all: bool = True
+            Whether to keep all candidate pattern-match pairs; if False, only one match is kept per pattern.
+
+        is_use_oldest: bool = True
+            Whether to return the oldest match when a pattern is hit; if False, the newest one is returned.
+
+        is_public_pool: bool = True
+            Whether to use a pool shared by all requests; if False, each request keeps a private pool.
+    """
+
+    prompt_lookup_num_tokens: int = 2
+    max_matching_ngram_size: int = 4
+    is_keep_all: bool = True
+    is_use_oldest: bool = True
+    is_public_pool: bool = True
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        return cls(**data)
+
+    decoding_type: ClassVar[str] = "NGram"
+
+
 class MTPDecodingConfig(DecodingBaseConfig):
     num_nextn_predict_layers: Optional[int] = 1
     use_relaxed_acceptance_for_thinking: Optional[bool] = False
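For orientation, a minimal usage sketch of the new config. This is a hedged illustration, not part of the commit: it assumes `NGramDecodingConfig` is exported next to the other decoding configs and that the `LLM` entry point accepts it via `speculative_config`, as the `_setup_speculative_config` changes later in this diff suggest.

    # Hypothetical sketch; model path and export location are assumptions.
    from tensorrt_llm.llmapi import LLM

    # Draft up to 2 tokens per step by matching the last (up to) 4 generated
    # tokens against previously seen n-grams.
    spec = NGramDecodingConfig(
        prompt_lookup_num_tokens=2,
        max_matching_ngram_size=4,
        is_public_pool=True,  # share the n-gram pool across requests
    )

    llm = LLM(model="/path/to/model", speculative_config=spec)

Note that the setup code later in this diff asserts the PyTorch backend when an NGram config is used.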
@@ -512,7 +558,9 @@ class LookaheadDecodingConfig(DecodingBaseConfig, PybindMirror):
         get_default_lookahead_decoding_verification_set(),
         description="Number of NGrams in verification branch per step.")

-    @validator('max_window_size', 'max_ngram_size', 'max_verification_set_size')
+    @field_validator('max_window_size', 'max_ngram_size',
+                     'max_verification_set_size')
+    @classmethod
     def validate_positive_values(cls, v):
         if v <= 0:
             raise ValueError(f"Value must be positive, got {v}")
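The swap from pydantic v1's `@validator` to v2's `@field_validator` (stacked with `@classmethod`) is the standard v1-to-v2 migration pattern. A self-contained illustration of the same pattern, not TensorRT-LLM code:

    from pydantic import BaseModel, field_validator

    class Window(BaseModel):
        max_window_size: int = 4

        @field_validator('max_window_size')
        @classmethod
        def validate_positive_values(cls, v: int) -> int:
            if v <= 0:
                raise ValueError(f"Value must be positive, got {v}")
            return v  # v2 validators must return the validated value

    Window(max_window_size=0)  # raises pydantic.ValidationError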
@@ -699,7 +747,10 @@ class _ModelWrapper:
         return self.model if isinstance(self.model, str) else None


-class LlmArgs(BaseModel):
+class BaseLlmArgs(BaseModel):
+    """
+    Base class for both TorchLlmArgs and TrtLlmArgs. It contains all the arguments that are common to both.
+    """
     model_config = {
         "arbitrary_types_allowed": True,
         "extra": "allow",
@@ -771,20 +822,11 @@ class BaseLlmArgs(BaseModel):
     cp_config: Optional[dict] = Field(default_factory=dict,
                                       description="Context parallel config.")

-    auto_parallel: bool = Field(default=False,
-                                description="Enable auto parallel mode.")
-
-    auto_parallel_world_size: Optional[int] = Field(
-        default=None, description="The world size for auto parallel mode.")
-
     load_format: Literal['auto', 'dummy'] = Field(
         default='auto',
         description="The format to load the model.",
         json_schema_extra={"type": "Literal['auto', 'dummy']"})

-    enable_tqdm: bool = Field(default=False,
-                              description="Enable tqdm for progress bar.")
-
     # LoRA arguments
     enable_lora: bool = Field(default=False, description="Enable LoRA.")

@@ -816,18 +858,9 @@ class BaseLlmArgs(BaseModel):
     quant_config: Optional[QuantConfig] = Field(
         default=None, description="Quantization config.")

-    calib_config: Optional[CalibConfig] = Field(
-        default=None, description="Calibration config.")
-
-    # BuildConfig is introduced to give users a familiar interface to configure the model building.
-    build_config: Optional[object] = Field(
-        default=None,
-        description="Build config.",
-        json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"})
-
     # Several options from ExecutorConfig, expanded here for less hierarchy
-    kv_cache_config: Optional[KvCacheConfig] = Field(
-        default=None, description="KV cache config.")
+    kv_cache_config: KvCacheConfig = Field(default_factory=KvCacheConfig,
+                                           description="KV cache config.")

     enable_chunked_prefill: bool = Field(default=False,
                                          description="Enable chunked prefill.")
@@ -850,29 +883,12 @@ class BaseLlmArgs(BaseModel):
         default=None,
         description="The maximum number of iterations for request stats.")

-    workspace: Optional[str] = Field(default=None,
-                                     description="The workspace for the model.")
-
-    # A handful of options from PretrainedConfig
-    embedding_parallel_mode: str = Field(
-        default='SHARDING_ALONG_VOCAB',
-        description="The embedding parallel mode.")
-
-    fast_build: bool = Field(default=False, description="Enable fast build.")
-
-    # Once set, the model will reuse the build_cache
-    enable_build_cache: object = Field(
-        default=False,
-        description="Enable build cache.",
-        json_schema_extra={
-            "type": f"Union[{get_type_repr(BuildCacheConfig)}, bool]"
-        })
-
     peft_cache_config: Optional[PeftCacheConfig] = Field(
         default=None, description="PEFT cache config.")

-    scheduler_config: Optional[SchedulerConfig] = Field(
-        default=None, description="Scheduler config.")
+    scheduler_config: SchedulerConfig = Field(default_factory=SchedulerConfig,
+                                              description="Scheduler config.")

     cache_transceiver_config: Optional[CacheTransceiverConfig] = Field(
         default=None, description="Cache transceiver config.")
@@ -880,8 +896,8 @@ class BaseLlmArgs(BaseModel):

     # Speculative decoding parameters
     speculative_config: Optional[Union[
         LookaheadDecodingConfig, MedusaDecodingConfig, EagleDecodingConfig,
-        MTPDecodingConfig]] = Field(default=None,
-                                    description="Speculative decoding config.")
+        MTPDecodingConfig, NGramDecodingConfig]] = Field(
+            default=None, description="Speculative decoding config.")

     batching_type: Optional[BatchingType] = Field(default=None,
                                                   description="Batching type.")
@@ -889,13 +905,6 @@ class BaseLlmArgs(BaseModel):
     normalize_log_probs: bool = Field(
         default=False, description="Normalize log probabilities.")

-    gather_generation_logits: bool = Field(
-        default=False, description="Gather generation logits.")
-
-    extended_runtime_perf_knob_config: Optional[
-        ExtendedRuntimePerfKnobConfig] = Field(
-            default=None, description="Extended runtime perf knob config.")
-
     max_batch_size: Optional[int] = Field(default=None,
                                           description="The maximum batch size.")

@@ -916,6 +925,9 @@ class BaseLlmArgs(BaseModel):
         description="The backend to use.",
         exclude=True)

+    gather_generation_logits: bool = Field(
+        default=False, description="Gather generation logits.")
+
     # private fields that are unstable and just for internal use
     num_postprocess_workers: int = Field(
         default=0,
@@ -988,40 +1000,19 @@ class BaseLlmArgs(BaseModel):
             moe_tp_size=self.moe_tensor_parallel_size,
             moe_ep_size=self.moe_expert_parallel_size,
             enable_attention_dp=self.enable_attention_dp,
-            cp_config=self.cp_config,
-            auto_parallel=self.auto_parallel)
-        if self.parallel_config.auto_parallel:
-            self.parallel_config.world_size = self.auto_parallel_world_size
-
-        self.auto_parallel_config = AutoParallelConfig(
-            sharded_io_allowlist=[
-                "past_key_value_\\d+",
-                "present_key_value_\\d*",
-            ],
-            same_buffer_io={
-                "past_key_value_(\\d+)": "present_key_value_\\1",
-            },
-            **infer_cluster_config(),
-        )
-
-        self.kv_cache_config = self.kv_cache_config or KvCacheConfig()
-
-        self.scheduler_config = self.scheduler_config or SchedulerConfig()
-
-        # This is used to hold the options for convert_checkpoint
-        self._convert_checkpoint_options = {}
+            cp_config=self.cp_config)

     @classmethod
-    def from_kwargs(cls, **kwargs: Any) -> "LlmArgs":
+    def from_kwargs(cls, **kwargs: Any) -> "BaseLlmArgs":
         """Create `LlmArgs` instance from kwargs.

         Args:
             kwargs (Any): Arguments passed to `LlmArgs` constructor.

         Returns:
-            tensorrt_llm.llmapi.llm_utils.LlmArgs: The `LlmArgs` instance.
+            tensorrt_llm.llmapi.llm_utils.BaseLlmArgs: The `BaseLlmArgs` instance.
         """
-        kwargs = LlmArgs._maybe_update_config_for_consistency(dict(kwargs))
+        kwargs = BaseLlmArgs._maybe_update_config_for_consistency(dict(kwargs))
         ret = cls(**kwargs)
         ret._setup()
         return ret
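For reference, a usage sketch of the `from_kwargs` flow (values are hypothetical; `tensor_parallel_size` is assumed from the parallel fields referenced above):

    # Build an args object from plain kwargs. from_kwargs() first reconciles
    # the kwargs against build_config/ExecutorConfig, then calls _setup(),
    # so the returned object is ready to hand to the rest of the pipeline.
    args = TrtLlmArgs.from_kwargs(
        model="/path/to/checkpoint",   # hypothetical path
        tensor_parallel_size=2,
        kv_cache_config=KvCacheConfig(),
    )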
@@ -1032,8 +1023,7 @@ class BaseLlmArgs(BaseModel):
         Returns:
             dict: The dict that contains all fields of the `LlmArgs` instance.
         """
-        return dict(
-            (field.name, getattr(self, field.name)) for field in fields(self))
+        return self.model_dump()

     @staticmethod
     def _maybe_update_config_for_consistency(
@@ -1041,18 +1031,18 @@ class BaseLlmArgs(BaseModel):
         # max_beam_width is not included since its behavior is vague, given the lack of support for dynamic beam width
         # during generation
         black_list = set(["max_beam_width"])
-        executor_config_attrs = set(attr for attr in dir(ExecutorConfig)
-                                    if not attr.startswith('_')
-                                    and callable(getattr(ExecutorConfig, attr)))
+        executor_config_attrs = set(
+            attr for attr in dir(_ExecutorConfig) if not attr.startswith('_')
+            and callable(getattr(_ExecutorConfig, attr)))
         executor_config_attrs -= black_list
-        llm_args_attr = set(LlmArgs.model_fields.keys())
-        # NOTE: When cpp ExecutorConfig adds new options, please add the new options into `_LlmArgs` with docs as well
+        llm_args_attr = set(BaseLlmArgs.model_fields.keys())
+        # NOTE: When cpp ExecutorConfig adds new options, please add the new options into `LlmArgs` with docs as well
         # ASK chunweiy for help if you are not sure about the new options.
         assert executor_config_attrs.issubset(
             llm_args_attr
         ), f"New options found in underlying ExecutorConfig: {llm_args_attr - executor_config_attrs}"

-        # ensure build_config and LlmArgs consistency
+        # ensure build_config and BaseLlmArgs consistency
         if kwargs_dict.get("backend") != "pytorch" and kwargs_dict.get(
                 "build_config"):
             # TODO: move this to _perform_config_arbitration() once it's default-on.
@@ -1062,11 +1052,11 @@ class BaseLlmArgs(BaseModel):
                 build_val = getattr(kwargs_dict["build_config"], field_name,
                                     None)
                 llmargs_val = kwargs_dict.get(
-                    field_name) or LlmArgs.model_fields[field_name]
+                    field_name) or BaseLlmArgs.model_fields[field_name]

                 if build_val != llmargs_val:
                     logger.warning(
-                        f"Overriding LlmArgs.{field_name} ({llmargs_val}) with build_config.{field_name} ({build_val})."
+                        f"Overriding BaseLlmArgs.{field_name} ({llmargs_val}) with build_config.{field_name} ({build_val})."
                     )
                     kwargs_dict[field_name] = build_val

@@ -1075,12 +1065,15 @@ class BaseLlmArgs(BaseModel):
     def _setup(self):
         ''' This method will set up the configs right before building the model. '''

+        is_trt_llm_args = isinstance(self, TrtLlmArgs)
+
         assert isinstance(self.model,
                           (str, Path)), f"Invalid model: {self.model}"

-        self._setup_embedding_parallel_mode()
+        if is_trt_llm_args:
+            self._setup_embedding_parallel_mode()

-        if self.enable_build_cache:
+        if is_trt_llm_args and self.enable_build_cache:
             self.enable_build_cache = BuildCacheConfig() if isinstance(
                 self.enable_build_cache, bool) else self.enable_build_cache
             if not isinstance(self.enable_build_cache, BuildCacheConfig):
@@ -1121,7 +1114,8 @@ class BaseLlmArgs(BaseModel):

         self.quant_config = self.quant_config or QuantConfig()

-        self.calib_config = self.calib_config or CalibConfig()
+        if is_trt_llm_args:
+            self.calib_config = self.calib_config or CalibConfig()

         # Note: max_batch_size and max_num_tokens in LlmArgs are for runtime,
         # which will be passed to the C++ Executor API, overwriting the values
@@ -1148,8 +1142,9 @@ class BaseLlmArgs(BaseModel):
             self.build_config.max_num_tokens = self.max_num_tokens

         # TODO: remove the checker when manage weights supports all data types
-        if self.fast_build and (self.quant_config.quant_algo is QuantAlgo.FP8
-                                or self.quant_config.quant_algo is None):
+        if is_trt_llm_args and self.fast_build and (
+                self.quant_config.quant_algo is QuantAlgo.FP8
+                or self.quant_config.quant_algo is None):
             self._update_plugin_config("manage_weights", True)

         if self.parallel_config._world_size == 1:
@@ -1162,9 +1157,12 @@ class BaseLlmArgs(BaseModel):
         if self.max_lora_rank is not None:
             self.build_config.lora_config.max_lora_rank = self.max_lora_rank

+        self._setup_speculative_config()
+
         if self.enable_prompt_adapter:
             self.build_config.max_prompt_embedding_table_size = self.max_prompt_adapter_token * self.build_config.max_batch_size

+    def _setup_speculative_config(self):
         if self.speculative_config:
             if isinstance(self.speculative_config, LookaheadDecodingConfig):
                 lookahead_config = self.speculative_config
@@ -1194,7 +1192,7 @@ class BaseLlmArgs(BaseModel):
                 self.build_config.max_draft_len = self.speculative_config.max_draft_len

                 if self.backend != 'pytorch':
-                    eagle_config = EagleConfig(
+                    eagle_config = _EagleConfig(
                         self.speculative_config.eagle_choices,
                         self.speculative_config.greedy_sampling,
                         self.speculative_config.posterior_threshold,
@@ -1207,9 +1205,25 @@ class BaseLlmArgs(BaseModel):
                     from tensorrt_llm._torch.speculative import Eagle3Config
                     self.speculative_config = Eagle3Config(
                         max_draft_tokens=self.speculative_config.max_draft_len,
-                        eagle_weights_path=self.speculative_config.
-                        pytorch_eagle_weights_path)
-
+                        draft_model_path=self.speculative_config.
+                        pytorch_eagle_weights_path,
+                        eagle3_one_model=self.speculative_config.
+                        eagle3_one_model)
+            elif isinstance(self.speculative_config, NGramDecodingConfig):
+                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.NGRAM
+                assert self.backend == 'pytorch'
+                assert self.speculative_config.prompt_lookup_num_tokens > 0 and self.speculative_config.max_matching_ngram_size > 0
+                self.build_config.max_draft_len = self.speculative_config.max_draft_len
+                from tensorrt_llm._torch.speculative import NGramConfig
+                self.speculative_config = NGramConfig(
+                    prompt_lookup_num_tokens=self.speculative_config.
+                    prompt_lookup_num_tokens,
+                    max_matching_ngram_size=self.speculative_config.
+                    max_matching_ngram_size,
+                    is_keep_all=self.speculative_config.is_keep_all,
+                    is_use_oldest=self.speculative_config.is_use_oldest,
+                    is_public_pool=self.speculative_config.is_public_pool,
+                )
             elif isinstance(self.speculative_config, MTPDecodingConfig):
                 from tensorrt_llm._torch.speculative import MTPConfig
                 self.speculative_config = MTPConfig(
@@ -1350,32 +1364,385 @@ class BaseLlmArgs(BaseModel):
                 f"Invalid embedding_parallel_mode: {self.llm_args.embedding_parallel_mode}"
             )

-    def _validate_kv_cache_config(self):
-        if self.kv_cache_config is None:
-            raise ValueError("KvCacheConfig is required for streaming LLM.")
-
-        if self.kv_cache_config.max_attention_window is None:
-            raise ValueError(
-                "KvCacheConfig.max_attention_window should be set for streaming LLM."
-            )
-        if any(i <= 0 for i in self.kv_cache_config.max_attention_window):
-            raise ValueError(
-                "Elements in KvCacheConfig.max_attention_window should be greater than 0."
-            )
-
-        if self.kv_cache_config.sink_token_length is None:
-            raise ValueError(
-                "KvCacheConfig.sink_token_length should be set for streaming LLM."
-            )
-        if self.kv_cache_config.sink_token_length <= 0:
-            raise ValueError(
-                "KvCacheConfig.sink_token_length should be greater than 0.")
+
+class TrtLlmArgs(BaseLlmArgs):
+    auto_parallel: bool = Field(
+        default=False,
+        description="Enable auto parallel mode.",
+        deprecated=
+        "Use tensor_parallel_size/pipeline_parallel_size/xxx_parallel_size instead.",
+    )
+
+    auto_parallel_world_size: Optional[int] = Field(
+        default=None,
+        description="The world size for auto parallel mode.",
+        deprecated=
+        "Use tensor_parallel_size/pipeline_parallel_size/xxx_parallel_size instead.",
+    )
+
+    enable_tqdm: bool = Field(default=False,
+                              description="Enable tqdm for progress bar.")
+
+    # BuildConfig is introduced to give users a familiar interface to configure the model building.
+    build_config: Optional[object] = Field(
+        default=None,
+        description="Build config.",
+        json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"})
+
+    workspace: Optional[str] = Field(default=None,
+                                     description="The workspace for the model.")
+
+    # Once set, the model will reuse the build_cache
+    enable_build_cache: object = Field(
+        default=False,
+        description="Enable build cache.",
+        json_schema_extra={
+            "type": f"Union[{get_type_repr(BuildCacheConfig)}, bool]"
+        })
+
+    extended_runtime_perf_knob_config: Optional[
+        ExtendedRuntimePerfKnobConfig] = Field(
+            default=None, description="Extended runtime perf knob config.")
+
+    calib_config: Optional[CalibConfig] = Field(
+        default=None, description="Calibration config.")
+
+    embedding_parallel_mode: str = Field(
+        default='SHARDING_ALONG_VOCAB',
+        description="The embedding parallel mode.")
+
+    fast_build: bool = Field(default=False, description="Enable fast build.")
+
+    # Private attributes
+    _auto_parallel_config: Optional[AutoParallelConfig] = PrivateAttr(
+        default=None)
+    # This is used to hold the options for convert_checkpoint
+    _convert_checkpoint_options: Dict[str,
+                                      Any] = PrivateAttr(default_factory=dict)
+
+    @property
+    def auto_parallel_config(self) -> AutoParallelConfig:
+        return self._auto_parallel_config
+
+    @print_traceback_on_error
+    def model_post_init(self, __context):
+        super().model_post_init(__context)
+
+        self._auto_parallel_config = AutoParallelConfig(
+            sharded_io_allowlist=[
+                "past_key_value_\\d+",
+                "present_key_value_\\d*",
+            ],
+            same_buffer_io={
+                "past_key_value_(\\d+)": "present_key_value_\\1",
+            },
+            **infer_cluster_config(),
+        )
+
+        self.parallel_config.auto_parallel = self.auto_parallel
+
+        if self.parallel_config.auto_parallel:
+            self.parallel_config.world_size = self.auto_parallel_world_size
+
+
+LlmArgs = TrtLlmArgs
+
+LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(LlmArgs,
+                                                            indent=' ' * 4)
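Since `LlmArgs` is now just an alias for `TrtLlmArgs`, pre-split call sites keep working unchanged. A small sketch (hypothetical path):

    args = LlmArgs.from_kwargs(model="/path/to/checkpoint")  # old spelling
    assert isinstance(args, TrtLlmArgs)                      # resolves to the new class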
+class LoadFormat(Enum):
+    AUTO = 0
+    # Initialize all weights randomly.
+    DUMMY = 1
+
+
+class TorchLlmArgs(BaseLlmArgs):
+
+    # Just a dummy BuildConfig to allow code reuse with the TrtLlmArgs
+    build_config: Optional[object] = Field(
+        default=None,
+        description="Build config.",
+        exclude_from_json=True,
+        json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"})
+
+    # PyTorch backend specific configurations
+    use_cuda_graph: bool = Field(
+        default=False,
+        description=
+        "If true, use CUDA graphs for decoding. CUDA graphs are only created for the batch sizes in cuda_graph_batch_sizes, and are enabled for batches that consist of decoding requests *only* (the reason is that it's hard to capture a single graph with prefill requests since the input shapes are a function of the sequence lengths). Note that each CUDA graph can use up to 200 MB of extra memory."
+    )
+
+    cuda_graph_batch_sizes: Optional[List[int]] = Field(
+        default=None,
+        description="List of batch sizes to create CUDA graphs for.")
+
+    cuda_graph_max_batch_size: int = Field(
+        default=0, description="Maximum batch size for CUDA graphs.")
+
+    cuda_graph_padding_enabled: bool = Field(
+        default=False,
+        description=
+        "If true, batches are rounded up to the nearest cuda_graph_batch_size. This is usually a net win for performance."
+    )
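Taken together, the CUDA-graph knobs above might be wired up like this (a hedged sketch; the model path and chosen values are illustrative only):

    # Capture decode-only CUDA graphs for a few common batch sizes and pad
    # odd-sized batches up to the nearest captured size.
    args = TorchLlmArgs.from_kwargs(
        model="/path/to/model",
        use_cuda_graph=True,
        cuda_graph_batch_sizes=[1, 2, 4, 8],
        cuda_graph_padding_enabled=True,
    )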
+    disable_overlap_scheduler: bool = Field(
+        default=False, description="Disable the overlap scheduler.")
+
+    moe_max_num_tokens: Optional[int] = Field(
+        default=None,
+        description=
+        "If set, at most moe_max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. If the number of tokens exceeds moe_max_num_tokens, the input tensors will be split into chunks and a for loop will be used."
+    )
+
+    moe_load_balancer: Optional[Union[object, str]] = Field(
+        default=None,
+        description="Configuration for MoE load balancing.",
+        json_schema_extra={"type": "Union[MoeLoadBalancerConfig, str]"})
+
+    attn_backend: str = Field(default='TRTLLM',
+                              description="Attention backend to use.")
+
+    moe_backend: str = Field(default='CUTLASS',
+                             description="MoE backend to use.")
+
+    mixed_sampler: bool = Field(
+        default=False,
+        description=
+        "If true, will iterate over sampling_params of each request and use the corresponding sampling strategy, e.g. top-k, top-p, etc."
+    )
+
+    enable_trtllm_sampler: bool = Field(
+        default=False,
+        description=
+        "If true, will use the TRTLLM sampler instead of the PyTorch sampler. The TRTLLM sampler has a wide coverage of sampling strategies."
+    )
+
+    kv_cache_dtype: str = Field(default="auto",
+                                description="Data type for KV cache.")
+
+    use_kv_cache: bool = Field(default=True,
+                               description="Whether to use KV cache.")
+
+    enable_iter_perf_stats: bool = Field(
+        default=False, description="Enable iteration performance statistics.")
+
+    enable_iter_req_stats: bool = Field(
+        default=False,
+        description=
+        "If true, enables per request stats per iteration. Must also set enable_iter_perf_stats to true to get request stats."
+    )
+
+    print_iter_log: bool = Field(default=False,
+                                 description="Print iteration logs.")
+
+    torch_compile_enabled: bool = Field(
+        default=False, description="Enable torch.compile optimization.")
+
+    torch_compile_fullgraph: bool = Field(
+        default=True,
+        description="Enable full graph compilation in torch.compile.")
+
+    torch_compile_inductor_enabled: bool = Field(
+        default=False, description="Enable inductor backend in torch.compile.")
+
+    torch_compile_piecewise_cuda_graph: bool = Field(
+        default=False,
+        description="Enable piecewise CUDA graph in torch.compile.")
+
+    torch_compile_enable_userbuffers: bool = Field(
+        default=True,
+        description=
+        "When torch compile is enabled, userbuffers is enabled by default.")
+
+    autotuner_enabled: bool = Field(
+        default=True,
+        description="Enable autotuner only when torch compile is enabled.")
+
+    enable_layerwise_nvtx_marker: bool = Field(
+        default=False, description="If true, enable layerwise nvtx marker.")
+
+    auto_deploy_config: Optional[object] = Field(
+        default=None,
+        description="Auto deploy config.",
+        exclude_from_json=True,
+        json_schema_extra={"type": f"Optional[AutoDeployConfig]"})
+
+    load_format: Union[str, LoadFormat] = Field(
+        default=LoadFormat.AUTO,
+        description=
+        "How to load the model weights. By default, detect the weight type from the model checkpoint."
+    )
+
+    enable_min_latency: bool = Field(
+        default=False,
+        description=
+        "If true, enable min-latency mode. Currently only used for Llama4.",
+    )
+
+    @field_validator('load_format', mode='before')
+    @classmethod
+    def convert_load_format(cls, v):
+        if isinstance(v, LoadFormat):
+            return v
+        load_format = v.upper()
+        if load_format not in LoadFormat.__members__:
+            raise ValueError(f"Invalid LoadFormat: {v}")
+        return LoadFormat[load_format]
+
+    # Extra resource managers to use in addition to the KV cache manager.
|
||||
# Each manager's prepare_resources method is called before the forward pass,
|
||||
# and update_resources() is called after the pass finishes. free_resources()
|
||||
# is called when a request finishes. The KV cache manager is guaranteed to
|
||||
# be invoked after all of these extra managers in all stages.
|
||||
_extra_resource_managers: Dict[str,
|
||||
object] = PrivateAttr(default_factory=dict, )
|
||||
|
||||
@property
|
||||
def extra_resource_managers(self) -> Dict[str, object]:
|
||||
return self._extra_resource_managers
|
||||
|
||||
@extra_resource_managers.setter
|
||||
def extra_resource_managers(self, value: Dict[str, object]) -> None:
|
||||
self._extra_resource_managers = value
|
||||
|
||||
@print_traceback_on_error
|
||||
def model_post_init(self, __context):
|
||||
from .._torch.model_config import MoeLoadBalancerConfig
|
||||
|
||||
super().model_post_init(__context)
|
||||
self.model_format = _ModelFormatKind.HF
|
||||
|
||||
if isinstance(self.moe_load_balancer, str):
|
||||
if not os.path.exists(self.moe_load_balancer):
|
||||
raise FileNotFoundError(
|
||||
f"MoE load balancer config file not found: {self.moe_load_balancer}"
|
||||
)
|
||||
try:
|
||||
with open(self.moe_load_balancer) as f:
|
||||
moe_load_balancer_config = yaml.safe_load(f)
|
||||
self.moe_load_balancer = MoeLoadBalancerConfig(
|
||||
**moe_load_balancer_config)
|
||||
except Exception as e:
|
||||
raise ValueError(
|
||||
f"Failed to load MoE load balancer config file: {self.moe_load_balancer}"
|
||||
) from e
|
||||
|
||||
# TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig
|
||||
def get_pytorch_backend_config(self) -> "PyTorchConfig":
|
||||
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
|
||||
|
||||
# TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig
|
||||
# Just a WAR to support the auto_deploy
|
||||
if self.auto_deploy_config is not None:
|
||||
return self.auto_deploy_config
|
||||
|
||||
return PyTorchConfig(
|
||||
extra_resource_managers=self.extra_resource_managers,
|
||||
use_cuda_graph=self.use_cuda_graph,
|
||||
cuda_graph_batch_sizes=self.cuda_graph_batch_sizes,
|
||||
cuda_graph_max_batch_size=self.cuda_graph_max_batch_size,
|
||||
cuda_graph_padding_enabled=self.cuda_graph_padding_enabled,
|
||||
disable_overlap_scheduler=self.disable_overlap_scheduler,
|
||||
moe_max_num_tokens=self.moe_max_num_tokens,
|
||||
moe_load_balancer=self.moe_load_balancer,
|
||||
attn_backend=self.attn_backend,
|
||||
moe_backend=self.moe_backend,
|
||||
mixed_sampler=self.mixed_sampler,
|
||||
enable_trtllm_sampler=self.enable_trtllm_sampler,
|
||||
kv_cache_dtype=self.kv_cache_dtype,
|
||||
use_kv_cache=self.use_kv_cache,
|
||||
enable_iter_perf_stats=self.enable_iter_perf_stats,
|
||||
enable_iter_req_stats=self.enable_iter_req_stats,
|
||||
print_iter_log=self.print_iter_log,
|
||||
torch_compile_enabled=self.torch_compile_enabled,
|
||||
torch_compile_fullgraph=self.torch_compile_fullgraph,
|
||||
torch_compile_inductor_enabled=self.torch_compile_inductor_enabled,
|
||||
torch_compile_piecewise_cuda_graph=self.
|
||||
torch_compile_piecewise_cuda_graph,
|
||||
torch_compile_enable_userbuffers=self.
|
||||
torch_compile_enable_userbuffers,
|
||||
autotuner_enabled=self.autotuner_enabled,
|
||||
enable_layerwise_nvtx_marker=self.enable_layerwise_nvtx_marker,
|
||||
load_format=self.load_format,
|
||||
enable_min_latency=self.enable_min_latency)
|
||||
|
||||
@field_validator('cuda_graph_max_batch_size')
|
||||
@classmethod
|
||||
def validate_cuda_graph_max_batch_size(cls, v):
|
||||
"""Validate cuda_graph_max_batch_size is non-negative."""
|
||||
if v < 0:
|
||||
raise ValueError("cuda_graph_max_batch_size must be non-negative")
|
||||
return v
|
||||
|
||||
@staticmethod
|
||||
def _generate_cuda_graph_batch_sizes(max_batch_size: int,
|
||||
padding_enabled: bool) -> List[int]:
|
||||
"""Generate a list of batch sizes for CUDA graphs.
|
||||
|
||||
Args:
|
||||
max_batch_size: Maximum batch size to generate up to
|
||||
padding_enabled: Whether padding is enabled, which affects the batch size distribution
|
||||
|
||||
Returns:
|
||||
List of batch sizes to create CUDA graphs for
|
||||
"""
|
||||
if padding_enabled:
|
||||
batch_sizes = [1, 2, 4] + [i * 8 for i in range(1, 17)]
|
||||
else:
|
||||
batch_sizes = list(range(1, 32)) + [32, 64, 128]
|
||||
|
||||
# Add powers of 2 up to max_batch_size
|
||||
batch_sizes += [
|
||||
2**i for i in range(8, math.floor(math.log(max_batch_size, 2)))
|
||||
]
|
||||
|
||||
# Filter and sort batch sizes
|
||||
batch_sizes = sorted(
|
||||
[size for size in batch_sizes if size <= max_batch_size])
|
||||
|
||||
# Add max_batch_size if not already included
|
||||
if max_batch_size != batch_sizes[-1]:
|
||||
batch_sizes.append(max_batch_size)
|
||||
|
||||
return batch_sizes
|
||||
|
||||
@model_validator(mode='after')
|
||||
def validate_cuda_graph_config(self) -> 'TorchLlmArgs':
|
||||
"""Validate CUDA graph configuration.
|
||||
|
||||
Ensures that:
|
||||
1. If cuda_graph_batch_sizes is provided, cuda_graph_max_batch_size must be 0
|
||||
2. If cuda_graph_batch_sizes is not provided, it is generated based on cuda_graph_max_batch_size
|
||||
3. If both are provided, cuda_graph_batch_sizes must match the generated values
|
||||
"""
|
||||
if self.cuda_graph_batch_sizes is not None:
|
||||
self.cuda_graph_batch_sizes = sorted(self.cuda_graph_batch_sizes)
|
||||
if self.cuda_graph_max_batch_size != 0:
|
||||
if self.cuda_graph_batch_sizes != self._generate_cuda_graph_batch_sizes(
|
||||
self.cuda_graph_max_batch_size,
|
||||
self.cuda_graph_padding_enabled):
|
||||
raise ValueError(
|
||||
"Please don't set both cuda_graph_batch_sizes "
|
||||
"and cuda_graph_max_batch_size.\n"
|
||||
f"cuda_graph_batch_sizes: {self.cuda_graph_batch_sizes}, "
|
||||
f"cuda_graph_max_batch_size: {self.cuda_graph_max_batch_size}"
|
||||
)
|
||||
else:
|
||||
self.cuda_graph_max_batch_size = max(
|
||||
self.cuda_graph_batch_sizes)
|
||||
else:
|
||||
max_batch_size = self.cuda_graph_max_batch_size or 128
|
||||
generated_sizes = self._generate_cuda_graph_batch_sizes(
|
||||
max_batch_size, self.cuda_graph_padding_enabled)
|
||||
self.cuda_graph_batch_sizes = generated_sizes
|
||||
self.cuda_graph_max_batch_size = max_batch_size
|
||||
|
||||
return self
|
||||
|
||||
|
||||
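A worked example of the batch-size generation above; the expected value is derived by tracing the code, so treat it as an illustration rather than a guarantee:

sizes = TorchLlmArgs._generate_cuda_graph_batch_sizes(max_batch_size=128,
                                                      padding_enabled=True)
# [1, 2, 4] plus every multiple of 8 up to 128. The powers-of-two term adds
# nothing here, since range(8, math.floor(math.log(128, 2))) == range(8, 7)
# is empty, and 128 is already the last entry.
assert sizes == [1, 2, 4] + [8 * i for i in range(1, 17)]
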
def update_llm_args_with_extra_dict(
        llm_args: Dict,
        llm_args_dict: Dict,

BIN  _images/8x_l20_L40S_node_architecture.png (new file, 261 KiB; binary file not shown)
BIN  _images/tech_blog3_mla_absorb.png (new file, 548 KiB; binary file not shown)
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />

</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -680,6 +684,15 @@
</p>
</div>

<div class="footer-item">
<div class="extra_footer">

<p>Last updated on June 03, 2025.</p>

<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>

</div></div>

</div>

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />

</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -1986,6 +1990,15 @@
</p>
</div>

<div class="footer-item">
<div class="extra_footer">

<p>Last updated on June 03, 2025.</p>

<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>

</div></div>

</div>

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />

</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -661,6 +665,15 @@
</p>
</div>

<div class="footer-item">
<div class="extra_footer">

<p>Last updated on June 03, 2025.</p>

<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>

</div></div>

</div>

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />

</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -760,6 +764,10 @@
                output.finish_reason = 'length'
            elif finish_reasons[src_idx] == tllm.FinishReason.TIMED_OUT:
                output.finish_reason = 'timeout'
            # For disaggregated serving, finish reason might be NOT_FINISHED which is ok
            elif finish_reasons[
                    src_idx] == tllm.FinishReason.NOT_FINISHED and self.disaggregated_params is not None and self.disaggregated_params.request_type == "context_only":
                output.finish_reason = 'not_finished'
            elif finish_reasons[src_idx] == tllm.FinishReason.CANCELLED:
                pass
            else:
@ -1262,6 +1270,15 @@
</p>
</div>

<div class="footer-item">
<div class="extra_footer">

<p>Last updated on June 03, 2025.</p>

<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>

</div></div>

</div>

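The branch added in the @ -760,6 +764,10 @@ hunk above makes NOT_FINISHED an acceptable terminal state only for the context-only leg of a disaggregated request. A condensed restatement of that predicate as a standalone function (hypothetical helper for illustration; names mirror the hunk):

def not_finished_is_ok(reason: str, disaggregated_params) -> bool:
    # Hypothetical restatement of the condition in the diff above:
    # NOT_FINISHED is reported as 'not_finished' only when the request was
    # the context-only half of a disaggregated-serving pair.
    return (reason == "NOT_FINISHED" and disaggregated_params is not None
            and disaggregated_params.request_type == "context_only")
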
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />

</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -510,6 +514,8 @@
from queue import Empty, Queue
from typing import Any, Callable, List, NamedTuple, Optional

from strenum import StrEnum

from tensorrt_llm._utils import mpi_rank
from tensorrt_llm.bindings.executor import Response
from tensorrt_llm.llmapi.utils import print_colored_debug
@ -519,18 +525,35 @@
                             RemoteMpiCommSessionClient)
from ..llmapi.utils import print_colored_debug


class LlmLauncherEnvs(StrEnum):
    # Spawn a process for the LLM-API Proxy
    TLLM_SPAWN_PROXY_PROCESS = "TLLM_SPAWN_PROXY_PROCESS"
    TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR = "TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR"
    TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY = "TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY"

    # Whether to use periodical responses handler in await_responses
    TLLM_EXECUTOR_PERIODICAL_RESP_IN_AWAIT = "TLLM_EXECUTOR_PERIODICAL_RESP_IN_AWAIT"


PERIODICAL_RESP_IN_AWAIT = os.getenv(
    "TLLM_EXECUTOR_PERIODICAL_RESP_IN_AWAIT") == "1"
    LlmLauncherEnvs.TLLM_EXECUTOR_PERIODICAL_RESP_IN_AWAIT) == "1"


def get_spawn_proxy_process_ipc_addr_env() -> str | None:
    ''' Get the IPC address for the spawn proxy process dynamically. '''
    return os.getenv("TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR")
    return os.getenv(LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR)


def get_spawn_proxy_process_ipc_hmac_key_env() -> bytes | None:
    ''' Get the HMAC key for the spawn proxy process dynamically. '''
    if key := os.getenv("TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY"):
        return bytes.fromhex(key)


def get_spawn_proxy_process_env() -> bool:
    ''' Get the environment variable for the spawn proxy process dynamically. '''
    return os.getenv("TLLM_SPAWN_PROXY_PROCESS") == "1"
    return os.getenv(LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS) == "1"


if PERIODICAL_RESP_IN_AWAIT:
@ -543,14 +566,11 @@
    ) == 0, f"create_mpi_comm_session must be called by rank 0, but it was called by rank {mpi_rank()}"
    if get_spawn_proxy_process_env():
        assert get_spawn_proxy_process_ipc_addr_env(
        ), "TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR is not set."
        ), f"{LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR} is not set."
        print_colored_debug(
            f"Using RemoteMpiPoolSessionClient to bind to external MPI processes at {get_spawn_proxy_process_ipc_addr_env()}\n",
            "yellow")
        hmac_key = os.getenv("TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY")
        # Convert the hex string to bytes
        if hmac_key is not None:
            hmac_key = bytes.fromhex(hmac_key)
        hmac_key = get_spawn_proxy_process_ipc_hmac_key_env()
        return RemoteMpiCommSessionClient(
            addr=get_spawn_proxy_process_ipc_addr_env(), hmac_key=hmac_key)
    else:

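The helpers in the @ -519,18 +525,35 @@ hunk above pass the HMAC key between processes as a hex string in an environment variable and decode it with bytes.fromhex. A minimal sketch of that round trip (the variable name comes from the code above; the 32-byte key length is an assumption for illustration):

import os
import secrets

# Launcher side: publish a random key as hex (32 bytes is an assumed length).
os.environ["TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY"] = secrets.token_bytes(32).hex()

# Worker side: recover it as bytes, mirroring
# get_spawn_proxy_process_ipc_hmac_key_env() above.
key = bytes.fromhex(os.environ["TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY"])
assert isinstance(key, bytes) and len(key) == 32
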
@ -758,6 +778,15 @@
</p>
</div>

<div class="footer-item">
<div class="extra_footer">

<p>Last updated on June 03, 2025.</p>

<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>

</div></div>

</div>

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />

</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -4727,7 +4731,8 @@
    UB = 2
    AUTO = 3
    ONESHOT = 4
    TWOSHOT = 5
    TWOSHOT = 5
    LOWPRECISION = 6
@ -8673,6 +8678,15 @@
</p>
</div>

<div class="footer-item">
<div class="extra_footer">

<p>Last updated on June 03, 2025.</p>

<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>

</div></div>

</div>

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -639,6 +643,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
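Every page touched by this commit receives the same pair of version bumps: the theme's switcher match string and the DocSearch version meta tag move from 0.20.0rc3 to 0.21.0rc0, while the switcher JSON stays at './_static/switcher.json'. As a rough illustration of how these two values cooperate (this is a simplified sketch, not the documentation theme's actual implementation, and the entries and URLs below are hypothetical), the theme fetches the JSON list and marks the entry whose version field equals the page's version_match:

```python
# Illustrative sketch only. It mimics how a docs version switcher could pair
# fetched switcher.json entries with a page's version_match string
# ('0.21.0rc0' after this commit). Entries and URLs are hypothetical.
from typing import TypedDict


class SwitcherEntry(TypedDict):
    name: str      # label shown in the version dropdown
    version: str   # value compared against theme_switcher_version_match
    url: str       # where the dropdown entry navigates


def active_entry(entries: list[SwitcherEntry],
                 version_match: str) -> SwitcherEntry | None:
    """Return the dropdown entry corresponding to the page being viewed."""
    for entry in entries:
        if entry["version"] == version_match:
            return entry
    return None


entries: list[SwitcherEntry] = [
    {"name": "0.20.0rc3", "version": "0.20.0rc3",
     "url": "https://example.invalid/0.20.0rc3/"},
    {"name": "0.21.0rc0", "version": "0.21.0rc0",
     "url": "https://example.invalid/"},
]
assert active_entry(entries, "0.21.0rc0") == entries[1]
```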
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -3504,6 +3508,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -646,6 +650,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -895,6 +899,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -1362,6 +1366,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -1210,6 +1214,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -1236,6 +1240,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -1000,6 +1004,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -655,6 +659,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -939,6 +943,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -515,6 +519,7 @@
from tqdm import tqdm
from transformers import PreTrainedTokenizerBase

from tensorrt_llm.builder import BuildConfig
from tensorrt_llm.inputs.data import TextPrompt
from tensorrt_llm.inputs.registry import DefaultInputProcessor
@ -532,8 +537,9 @@
from ..inputs import PromptInputs, create_input_processor, prompt_inputs
from ..logger import logger
from ..sampling_params import SamplingParams
from .llm_args import LLMARGS_EXPLICIT_DOCSTRING, PybindMirror
from .llm_utils import (CachedModelLoader, KvCacheRetentionConfig, LlmArgs,
from .llm_args import (LLMARGS_EXPLICIT_DOCSTRING, PybindMirror, TorchLlmArgs,
                        TrtLlmArgs)
from .llm_utils import (CachedModelLoader, KvCacheRetentionConfig,
                        LlmBuildStats, ModelLoader, _ModelRuntimeContext)
from .mpi_session import MpiPoolSession, external_mpi_comm_available
from .tokenizer import TokenizerBase, _xgrammar_tokenizer_info
@ -625,9 +631,10 @@
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">_executor_cls</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s2">"executor_cls"</span><span class="p">,</span> <span class="n">GenerationExecutor</span><span class="p">)</span>
|
||||
|
||||
<span class="k">try</span><span class="p">:</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">pytorch_backend_config</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">'pytorch_backend_config'</span><span class="p">,</span>
|
||||
<span class="kc">None</span><span class="p">)</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">args</span> <span class="o">=</span> <span class="n">LlmArgs</span><span class="o">.</span><span class="n">from_kwargs</span><span class="p">(</span>
|
||||
<span class="n">llm_args_cls</span> <span class="o">=</span> <span class="n">TorchLlmArgs</span> <span class="k">if</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span>
|
||||
<span class="s1">'backend'</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span> <span class="o">==</span> <span class="s1">'pytorch'</span> <span class="k">else</span> <span class="n">TrtLlmArgs</span>
|
||||
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">args</span> <span class="o">=</span> <span class="n">llm_args_cls</span><span class="o">.</span><span class="n">from_kwargs</span><span class="p">(</span>
|
||||
<span class="n">model</span><span class="o">=</span><span class="n">model</span><span class="p">,</span>
|
||||
<span class="n">tokenizer</span><span class="o">=</span><span class="n">tokenizer</span><span class="p">,</span>
|
||||
<span class="n">tokenizer_mode</span><span class="o">=</span><span class="n">tokenizer_mode</span><span class="p">,</span>
|
||||
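The hunk above swaps the hard-wired `LlmArgs.from_kwargs` call for a dispatch on the `backend` keyword: PyTorch runs get `TorchLlmArgs`, everything else gets `TrtLlmArgs`. A minimal sketch of that dispatch pattern, using hypothetical stand-in dataclasses rather than the real args classes:

```python
# A minimal sketch of backend-based args-class dispatch; _TorchArgs and
# _TrtArgs are hypothetical stand-ins, not the real TensorRT-LLM classes.
from dataclasses import dataclass


@dataclass
class _BaseArgs:
    model: str

    @classmethod
    def from_kwargs(cls, **kwargs):
        # Keep only the fields this particular args class declares.
        known = {k: v for k, v in kwargs.items()
                 if k in cls.__dataclass_fields__}
        return cls(**known)


@dataclass
class _TorchArgs(_BaseArgs):
    backend: str = "pytorch"


@dataclass
class _TrtArgs(_BaseArgs):
    backend: str = "trt"


def make_args(**kwargs):
    # Same shape as the diff: choose the args class from the backend kwarg.
    args_cls = _TorchArgs if kwargs.get("backend", None) == "pytorch" else _TrtArgs
    return args_cls.from_kwargs(**kwargs)


print(type(make_args(model="m", backend="pytorch")).__name__)  # _TorchArgs
print(type(make_args(model="m")).__name__)                     # _TrtArgs
```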
@ -675,8 +682,9 @@
<span class="c1"># Because the Executor can only accept an engine path, we need to save the engine to a directory</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_engine_dir</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Path</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_executor</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">GenerationExecutor</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_workspace</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">TemporaryDirectory</span><span class="p">(</span>
<span class="n">suffix</span><span class="o">=</span><span class="s2">"-llm-workspace"</span><span class="p">,</span> <span class="nb">dir</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">workspace</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_workspace</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">TemporaryDirectory</span><span class="p">(</span>
<span class="n">suffix</span><span class="o">=</span><span class="s2">"-llm-workspace"</span><span class="p">,</span> <span class="nb">dir</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">workspace</span><span class="p">)</span>

<span class="bp">self</span><span class="o">.</span><span class="n">_hf_model_dir</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Path</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>

@ -696,7 +704,7 @@

<span class="nd">@property</span>
<span class="k">def</span><span class="w"> </span><span class="nf">workspace</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Path</span><span class="p">:</span>
<span class="k">return</span> <span class="n">Path</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_workspace</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">return</span> <span class="n">Path</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_workspace</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span> <span class="k">else</span> <span class="kc">None</span>

<div class="viewcode-block" id="LLM.generate">
|
||||
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.LLM.generate">[docs]</a>
|
||||
@ -808,10 +816,13 @@
|
||||
<span class="sd"> """</span>
|
||||
<span class="n">sampling_params</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_sampling_params</span><span class="p">(</span><span class="n">sampling_params</span><span class="p">)</span>
|
||||
|
||||
<span class="k">if</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">n</span> <span class="o">></span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_batch_size</span><span class="p">:</span>
|
||||
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
|
||||
<span class="sa">f</span><span class="s2">"SamplingParams.n (</span><span class="si">{</span><span class="n">sampling_params</span><span class="o">.</span><span class="n">n</span><span class="si">}</span><span class="s2">) should not exceed max_batch_size (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_batch_size</span><span class="si">}</span><span class="s2">)"</span>
|
||||
<span class="p">)</span>
|
||||
<span class="c1"># With pytorch backend, py_executor has logic to handle max_tokens of 1,</span>
|
||||
<span class="c1"># so set to 1 to avoid allocating unnecessary KV cache blocks for single request</span>
|
||||
<span class="c1"># TODO: Also support for trt backend</span>
|
||||
<span class="k">if</span> <span class="p">(</span><span class="n">disaggregated_params</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
|
||||
<span class="ow">and</span> <span class="n">disaggregated_params</span><span class="o">.</span><span class="n">request_type</span> <span class="o">==</span> <span class="s2">"context_only"</span>
|
||||
<span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span><span class="p">):</span>
|
||||
<span class="n">sampling_params</span><span class="o">.</span><span class="n">max_tokens</span> <span class="o">=</span> <span class="mi">1</span>
|
||||
|
||||
<span class="n">inputs</span> <span class="o">=</span> <span class="n">prompt_inputs</span><span class="p">(</span><span class="n">inputs</span><span class="p">)</span>
|
||||
|
||||
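The new branch above caps `max_tokens` at 1 for context-only requests on the PyTorch backend: a prefill-only worker in a disaggregated deployment emits just the first token, so larger values would only reserve KV-cache blocks that are never used. A sketch of the same gating with simplified stand-in types:

```python
# Sketch of the context-only gating; these dataclasses are simplified
# stand-ins for the real SamplingParams / DisaggregatedParams.
from dataclasses import dataclass
from typing import Optional


@dataclass
class SamplingParamsSketch:
    max_tokens: int = 32


@dataclass
class DisaggregatedParamsSketch:
    request_type: str = "context_and_generation"


def prepare(sampling_params: SamplingParamsSketch,
            disaggregated_params: Optional[DisaggregatedParamsSketch],
            on_trt_backend: bool) -> SamplingParamsSketch:
    # A context-only (prefill) request on the PyTorch backend produces just
    # the first token, so cap max_tokens to avoid reserving KV-cache blocks.
    if (disaggregated_params is not None
            and disaggregated_params.request_type == "context_only"
            and not on_trt_backend):
        sampling_params.max_tokens = 1
    return sampling_params


params = prepare(SamplingParamsSketch(),
                 DisaggregatedParamsSketch("context_only"),
                 on_trt_backend=False)
print(params.max_tokens)  # 1
```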
@ -839,8 +850,9 @@
<span class="n">prompt</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">query_token_ids</span> <span class="o">=</span> <span class="n">inputs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"query_token_ids"</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="k">elif</span> <span class="s2">"prompt"</span> <span class="ow">in</span> <span class="n">inputs</span><span class="p">:</span>
<span class="n">prompt_token_ids</span><span class="p">,</span> <span class="n">extra_processed_inputs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">input_processor</span><span class="p">(</span>
<span class="n">inputs</span><span class="p">,</span> <span class="n">sampling_params</span><span class="p">)</span>
<span class="k">with</span> <span class="n">nvtx_range_debug</span><span class="p">(</span><span class="s2">"input_processor"</span><span class="p">):</span>
<span class="n">prompt_token_ids</span><span class="p">,</span> <span class="n">extra_processed_inputs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">input_processor</span><span class="p">(</span>
<span class="n">inputs</span><span class="p">,</span> <span class="n">sampling_params</span><span class="p">)</span>
<span class="n">prompt</span> <span class="o">=</span> <span class="n">inputs</span><span class="p">[</span><span class="s1">'prompt'</span><span class="p">]</span>
<span class="k">if</span> <span class="n">extra_processed_inputs</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">query_token_ids</span> <span class="o">=</span> <span class="n">extra_processed_inputs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">'query_token_ids'</span><span class="p">)</span>
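Wrapping the input-processor call in `nvtx_range_debug` makes tokenization show up as a named span in Nsight profiles. A rough stand-in for that kind of range context manager (the real helper emits NVTX markers; this sketch merely logs wall-clock time so it runs anywhere):

```python
# Rough stand-in for an NVTX-style named range. The real nvtx_range_debug
# emits NVTX markers for Nsight; this version just logs elapsed time.
import time
from contextlib import contextmanager


@contextmanager
def nvtx_range_debug(name: str):
    start = time.perf_counter()
    try:
        yield
    finally:
        elapsed_ms = (time.perf_counter() - start) * 1e3
        print(f"[range] {name}: {elapsed_ms:.2f} ms")


def toy_input_processor(text: str):
    return [ord(c) for c in text]  # toy "tokenization"


with nvtx_range_debug("input_processor"):
    token_ids = toy_input_processor("hello")
print(token_ids)
```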
@ -1025,10 +1037,28 @@
<span class="sa">f</span><span class="s2">"The sum of prompt length (</span><span class="si">{</span><span class="n">prompt_len</span><span class="o">/</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">cp_size</span><span class="si">}</span><span class="s2">) and query length (</span><span class="si">{</span><span class="n">query_len</span><span class="si">}</span><span class="s2">) max_tokens (</span><span class="si">{</span><span class="n">sampling_params</span><span class="o">.</span><span class="n">max_tokens</span><span class="si">}</span><span class="s2">) should not exceed "</span>
<span class="sa">f</span><span class="s2">"max_seq_len (</span><span class="si">{</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_seq_len</span><span class="si">}</span><span class="s2">)"</span><span class="p">)</span>

<span class="k">if</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">use_beam_search</span> <span class="ow">and</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">n</span> <span class="o">></span> <span class="n">build_config</span><span class="o">.</span><span class="n">max_beam_width</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">"sampling_params's n (</span><span class="si">{</span><span class="n">sampling_params</span><span class="o">.</span><span class="n">n</span><span class="si">}</span><span class="s2">) should not exceed max_beam_width (</span><span class="si">{</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_beam_width</span><span class="si">}</span><span class="s2">) when use_beam_search is True"</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">use_beam_search</span> <span class="ow">and</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">best_of</span> <span class="o">></span> <span class="n">build_config</span><span class="o">.</span><span class="n">max_beam_width</span><span class="p">:</span>
<span class="k">if</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">n</span> <span class="o">==</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">best_of</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">"sampling_params.n (</span><span class="si">{</span><span class="n">sampling_params</span><span class="o">.</span><span class="n">n</span><span class="si">}</span><span class="s2">) cannot exceed max_beam_width (</span><span class="si">{</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_beam_width</span><span class="si">}</span><span class="s2">) when use_beam_search is True"</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">"sampling_params.best_of (</span><span class="si">{</span><span class="n">sampling_params</span><span class="o">.</span><span class="n">best_of</span><span class="si">}</span><span class="s2">) cannot exceed max_beam_width (</span><span class="si">{</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_beam_width</span><span class="si">}</span><span class="s2">) when use_beam_search is True"</span>
<span class="p">)</span>

<span class="n">max_batch_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_batch_size</span>
<span class="k">if</span> <span class="n">max_batch_size</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">max_batch_size</span> <span class="o">=</span> <span class="n">build_config</span><span class="o">.</span><span class="n">max_batch_size</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">use_beam_search</span> <span class="ow">and</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">best_of</span> <span class="o">></span> <span class="n">max_batch_size</span><span class="p">:</span>
<span class="k">if</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">n</span> <span class="o">==</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">best_of</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">"sampling_params.n (</span><span class="si">{</span><span class="n">sampling_params</span><span class="o">.</span><span class="n">n</span><span class="si">}</span><span class="s2">) cannot exceed max_batch_size (</span><span class="si">{</span><span class="n">max_batch_size</span><span class="si">}</span><span class="s2">) when use_beam_search is False"</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">"sampling_params.best_of (</span><span class="si">{</span><span class="n">sampling_params</span><span class="o">.</span><span class="n">best_of</span><span class="si">}</span><span class="s2">) cannot exceed max_batch_size (</span><span class="si">{</span><span class="n">max_batch_size</span><span class="si">}</span><span class="s2">) when use_beam_search is False"</span>
<span class="p">)</span>

<span class="k">if</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">prompt_logprobs</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">build_config</span><span class="o">.</span><span class="n">gather_context_logits</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
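The expanded checks above separate `n` from `best_of`: under beam search `best_of` is bounded by `max_beam_width`, otherwise by `max_batch_size`, and the error names whichever field the caller actually set (`n` when `best_of` defaulted to `n`). A standalone sketch of those rules:

```python
# Standalone sketch of the n/best_of bounds introduced in the diff.
def check_sampling(n: int, best_of: int, use_beam_search: bool,
                   max_beam_width: int, max_batch_size: int) -> None:
    limit, limit_name = ((max_beam_width, "max_beam_width")
                         if use_beam_search else
                         (max_batch_size, "max_batch_size"))
    if best_of > limit:
        # Blame 'n' when best_of was left defaulted to n; otherwise
        # blame best_of itself, mirroring the diff's two messages.
        field, value = (("n", n) if n == best_of else ("best_of", best_of))
        raise ValueError(
            f"sampling_params.{field} ({value}) cannot exceed "
            f"{limit_name} ({limit}) when use_beam_search is {use_beam_search}")


check_sampling(n=2, best_of=2, use_beam_search=True,
               max_beam_width=4, max_batch_size=8)  # passes silently
try:
    check_sampling(n=8, best_of=8, use_beam_search=True,
                   max_beam_width=4, max_batch_size=8)
except ValueError as err:
    print(err)
```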
@ -1064,11 +1094,19 @@
<span class="bp">self</span><span class="o">.</span><span class="n">tokenizer</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">tokenizer</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">input_processor</span><span class="o">.</span><span class="n">tokenizer</span>

<span class="n">max_batch_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_batch_size</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_batch_size</span>
<span class="n">max_num_tokens</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_num_tokens</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_num_tokens</span>
<span class="n">max_seq_len</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_seq_len</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_seq_len</span>
<span class="n">max_batch_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_batch_size</span>
<span class="n">max_num_tokens</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_num_tokens</span>
<span class="n">max_seq_len</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_seq_len</span>

<span class="n">build_config</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span> <span class="k">else</span> <span class="n">BuildConfig</span><span class="p">(</span>
<span class="p">)</span>

<span class="n">max_batch_size</span> <span class="o">=</span> <span class="n">max_batch_size</span> <span class="ow">or</span> <span class="n">build_config</span><span class="o">.</span><span class="n">max_batch_size</span>
<span class="n">max_num_tokens</span> <span class="o">=</span> <span class="n">max_num_tokens</span> <span class="ow">or</span> <span class="n">build_config</span><span class="o">.</span><span class="n">max_num_tokens</span>
<span class="n">max_seq_len</span> <span class="o">=</span> <span class="n">max_seq_len</span> <span class="ow">or</span> <span class="n">build_config</span><span class="o">.</span><span class="n">max_seq_len</span>

<span class="n">executor_config</span> <span class="o">=</span> <span class="n">tllm</span><span class="o">.</span><span class="n">ExecutorConfig</span><span class="p">(</span>
<span class="n">max_beam_width</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_beam_width</span><span class="p">,</span>
<span class="n">max_beam_width</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_beam_width</span><span class="p">,</span>
<span class="n">scheduler_config</span><span class="o">=</span><span class="n">PybindMirror</span><span class="o">.</span><span class="n">maybe_to_pybind</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">scheduler_config</span><span class="p">),</span>
<span class="n">batching_type</span><span class="o">=</span><span class="n">PybindMirror</span><span class="o">.</span><span class="n">maybe_to_pybind</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">batching_type</span><span class="p">)</span>
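The rewrite defers the `or` fallbacks until a backend-appropriate `build_config` exists, synthesizing a fresh `BuildConfig()` on the PyTorch path. One caveat of `x or default` worth noting: it falls through on any falsy value, not only `None`. A small sketch of the resolution order, with a stand-in `BuildConfig`:

```python
# Sketch of the deferred fallback resolution: explicit per-argument values
# win; otherwise the (possibly synthesized) build config supplies defaults.
from dataclasses import dataclass
from typing import Optional


@dataclass
class BuildConfigSketch:  # stand-in with an illustrative default
    max_batch_size: int = 2048


def resolve_max_batch_size(explicit: Optional[int],
                           on_trt_backend: bool,
                           trt_build_config: Optional[BuildConfigSketch] = None) -> int:
    # The PyTorch path has no user build_config, so synthesize a fresh one.
    build_config = (trt_build_config if on_trt_backend and trt_build_config
                    else BuildConfigSketch())
    # Caveat: 'or' also falls through on 0, not only on None.
    return explicit or build_config.max_batch_size


print(resolve_max_batch_size(None, on_trt_backend=False))  # 2048 (default)
print(resolve_max_batch_size(128, on_trt_backend=False))   # 128 (explicit)
```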
@ -1094,7 +1132,7 @@
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">peft_cache_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">executor_config</span><span class="o">.</span><span class="n">peft_cache_config</span> <span class="o">=</span> <span class="n">PybindMirror</span><span class="o">.</span><span class="n">maybe_to_pybind</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">peft_cache_config</span><span class="p">)</span>
<span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">plugin_config</span><span class="o">.</span><span class="n">lora_plugin</span><span class="p">:</span>
<span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">plugin_config</span><span class="o">.</span><span class="n">lora_plugin</span><span class="p">:</span>
<span class="n">engine_config</span> <span class="o">=</span> <span class="n">EngineConfig</span><span class="o">.</span><span class="n">from_json_file</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_engine_dir</span> <span class="o">/</span>
<span class="s2">"config.json"</span><span class="p">)</span>
<span class="n">lora_config</span> <span class="o">=</span> <span class="n">engine_config</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">lora_config</span>
@ -1122,7 +1160,7 @@
<span class="n">executor_config</span><span class="o">.</span><span class="n">normalize_log_probs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">normalize_log_probs</span>
<span class="n">executor_config</span><span class="o">.</span><span class="n">enable_chunked_context</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">enable_chunked_prefill</span>
<span class="n">executor_config</span><span class="o">.</span><span class="n">max_beam_width</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_beam_width</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_beam_width</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">extended_runtime_perf_knob_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">extended_runtime_perf_knob_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">executor_config</span><span class="o">.</span><span class="n">extended_runtime_perf_knob_config</span> <span class="o">=</span> <span class="n">PybindMirror</span><span class="o">.</span><span class="n">maybe_to_pybind</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">extended_runtime_perf_knob_config</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">cache_transceiver_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
@ -1132,9 +1170,11 @@
<span class="n">update_executor_config</span><span class="p">(</span>
<span class="n">executor_config</span><span class="p">,</span>
<span class="n">backend</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">backend</span><span class="p">,</span>
<span class="n">pytorch_backend_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">pytorch_backend_config</span><span class="p">,</span>
<span class="n">pytorch_backend_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">get_pytorch_backend_config</span><span class="p">()</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">backend</span> <span class="o">==</span> <span class="s2">"pytorch"</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">mapping</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">to_mapping</span><span class="p">(),</span>
<span class="n">build_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="p">,</span>
<span class="n">build_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">speculative_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">speculative_config</span><span class="p">,</span>
<span class="n">hf_model_dir</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_hf_model_dir</span><span class="p">,</span>
<span class="n">trt_engine_dir</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_engine_dir</span><span class="p">,</span>
@ -1142,8 +1182,9 @@
<span class="n">max_seq_len</span><span class="o">=</span><span class="n">max_seq_len</span><span class="p">)</span>
<span class="n">executor_config</span><span class="o">.</span><span class="n">llm_parallel_config</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">parallel_config</span>
<span class="n">return_logits</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">gather_generation_logits</span> <span class="ow">or</span> <span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span>
<span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">gather_context_logits</span><span class="p">)</span>

<span class="bp">self</span><span class="o">.</span><span class="n">_executor</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_executor_cls</span><span class="o">.</span><span class="n">create</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_engine_dir</span><span class="p">,</span>
<span class="n">executor_config</span><span class="o">=</span><span class="n">executor_config</span><span class="p">,</span>
@ -1160,6 +1201,10 @@
<span class="n">is_llm_executor</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">lora_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">lora_config</span><span class="p">)</span>

<span class="nd">@property</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_on_trt_backend</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="p">,</span> <span class="n">TrtLlmArgs</span><span class="p">)</span>

<span class="k">def</span><span class="w"> </span><span class="nf">_try_load_tokenizer</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="n">TokenizerBase</span><span class="p">]:</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">skip_tokenizer_init</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">None</span>
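The new `_on_trt_backend` property reduces the backend question to a single `isinstance` check on the args object, so TRT-only code paths test one property instead of comparing backend strings. The pattern in miniature, with hypothetical stand-in classes:

```python
# Sketch of backend discrimination by args type instead of backend strings;
# these classes are hypothetical stand-ins for the real args hierarchy.
class BaseLlmArgsSketch:
    pass


class TrtLlmArgsSketch(BaseLlmArgsSketch):
    pass


class TorchLlmArgsSketch(BaseLlmArgsSketch):
    pass


class RunnerSketch:
    def __init__(self, args: BaseLlmArgsSketch):
        self.args = args

    @property
    def _on_trt_backend(self) -> bool:
        # One authoritative check; every TRT-only branch tests this.
        return isinstance(self.args, TrtLlmArgsSketch)


print(RunnerSketch(TrtLlmArgsSketch())._on_trt_backend)    # True
print(RunnerSketch(TorchLlmArgsSketch())._on_trt_backend)  # False
```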
@ -1379,6 +1424,15 @@
</p>
</div>

<div class="footer-item">
<div class="extra_footer">

<p>Last updated on June 03, 2025.</p>

<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>

</div></div>

</div>



@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />


</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -505,15 +509,18 @@
<h1>Source code for tensorrt_llm.llmapi.llm_args</h1><div class="highlight"><pre>
<span></span><span class="kn">import</span><span class="w"> </span><span class="nn">json</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">math</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">os</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">abc</span><span class="w"> </span><span class="kn">import</span> <span class="n">ABC</span><span class="p">,</span> <span class="n">abstractmethod</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">dataclasses</span><span class="w"> </span><span class="kn">import</span> <span class="n">dataclass</span><span class="p">,</span> <span class="n">field</span><span class="p">,</span> <span class="n">fields</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">dataclasses</span><span class="w"> </span><span class="kn">import</span> <span class="n">dataclass</span><span class="p">,</span> <span class="n">field</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">enum</span><span class="w"> </span><span class="kn">import</span> <span class="n">Enum</span><span class="p">,</span> <span class="n">EnumMeta</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pathlib</span><span class="w"> </span><span class="kn">import</span> <span class="n">Path</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">typing</span><span class="w"> </span><span class="kn">import</span> <span class="n">Any</span><span class="p">,</span> <span class="n">ClassVar</span><span class="p">,</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Literal</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Union</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">typing</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">TYPE_CHECKING</span><span class="p">,</span> <span class="n">Any</span><span class="p">,</span> <span class="n">ClassVar</span><span class="p">,</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Literal</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span>
<span class="n">Union</span><span class="p">)</span>

<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">yaml</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pydantic</span><span class="w"> </span><span class="kn">import</span> <span class="n">BaseModel</span><span class="p">,</span> <span class="n">Field</span><span class="p">,</span> <span class="n">validator</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pydantic</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">BaseModel</span><span class="p">,</span> <span class="n">Field</span><span class="p">,</span> <span class="n">PrivateAttr</span><span class="p">,</span> <span class="n">field_validator</span><span class="p">,</span>
<span class="n">model_validator</span><span class="p">)</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">strenum</span><span class="w"> </span><span class="kn">import</span> <span class="n">StrEnum</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">transformers</span><span class="w"> </span><span class="kn">import</span> <span class="n">PreTrainedTokenizerBase</span>

@ -522,23 +529,30 @@

<span class="kn">from</span><span class="w"> </span><span class="nn">.._utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">mpi_rank</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..auto_parallel</span><span class="w"> </span><span class="kn">import</span> <span class="n">AutoParallelConfig</span><span class="p">,</span> <span class="n">infer_cluster_config</span>

<span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm._torch.pyexecutor.config</span><span class="w"> </span><span class="kn">import</span> <span class="n">PyTorchConfig</span>

<span class="c1"># yapf: disable</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">BatchingType</span> <span class="k">as</span> <span class="n">_BatchingType</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> \
<span class="n">CacheTransceiverConfig</span> <span class="k">as</span> <span class="n">_CacheTransceiverConfig</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> \
<span class="n">CapacitySchedulerPolicy</span> <span class="k">as</span> <span class="n">_CapacitySchedulerPolicy</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">ContextChunkingPolicy</span> <span class="k">as</span> <span class="n">_ContextChunkingPolicy</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">DecodingConfig</span><span class="p">,</span> <span class="n">DecodingMode</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">DynamicBatchConfig</span> <span class="k">as</span> <span class="n">_DynamicBatchConfig</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">EagleConfig</span><span class="p">,</span> <span class="n">ExecutorConfig</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> \
<span class="n">ExtendedRuntimePerfKnobConfig</span> <span class="k">as</span> <span class="n">_ExtendedRuntimePerfKnobConfig</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">KvCacheConfig</span> <span class="k">as</span> <span class="n">_KvCacheConfig</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> \
<span class="n">LookaheadDecodingConfig</span> <span class="k">as</span> <span class="n">_LookaheadDecodingConfig</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">PeftCacheConfig</span> <span class="k">as</span> <span class="n">_PeftCacheConfig</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">SchedulerConfig</span> <span class="k">as</span> <span class="n">_SchedulerConfig</span>
<span class="c1"># isort: off</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span>
<span class="n">BatchingType</span> <span class="k">as</span> <span class="n">_BatchingType</span><span class="p">,</span>
<span class="n">CacheTransceiverConfig</span> <span class="k">as</span> <span class="n">_CacheTransceiverConfig</span><span class="p">,</span>
<span class="n">CapacitySchedulerPolicy</span> <span class="k">as</span> <span class="n">_CapacitySchedulerPolicy</span><span class="p">,</span>
<span class="n">ContextChunkingPolicy</span> <span class="k">as</span> <span class="n">_ContextChunkingPolicy</span><span class="p">,</span>
<span class="n">DecodingConfig</span><span class="p">,</span>
<span class="n">DecodingMode</span><span class="p">,</span>
<span class="n">DynamicBatchConfig</span> <span class="k">as</span> <span class="n">_DynamicBatchConfig</span><span class="p">,</span>
<span class="n">EagleConfig</span> <span class="k">as</span> <span class="n">_EagleConfig</span><span class="p">,</span>
<span class="n">ExecutorConfig</span> <span class="k">as</span> <span class="n">_ExecutorConfig</span><span class="p">,</span>
<span class="n">ExtendedRuntimePerfKnobConfig</span> <span class="k">as</span> <span class="n">_ExtendedRuntimePerfKnobConfig</span><span class="p">,</span>
<span class="n">KvCacheConfig</span> <span class="k">as</span> <span class="n">_KvCacheConfig</span><span class="p">,</span>
<span class="n">LookaheadDecodingConfig</span> <span class="k">as</span> <span class="n">_LookaheadDecodingConfig</span><span class="p">,</span>
<span class="n">PeftCacheConfig</span> <span class="k">as</span> <span class="n">_PeftCacheConfig</span><span class="p">,</span>
<span class="n">SchedulerConfig</span> <span class="k">as</span> <span class="n">_SchedulerConfig</span><span class="p">)</span> <span class="c1"># isort: skip</span>
<span class="c1"># isort: on</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">transformers</span><span class="w"> </span><span class="kn">import</span> <span class="n">PreTrainedTokenizerBase</span>

<span class="c1"># yapf: enable</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..builder</span><span class="w"> </span><span class="kn">import</span> <span class="n">BuildConfig</span><span class="p">,</span> <span class="n">EngineConfig</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..logger</span><span class="w"> </span><span class="kn">import</span> <span class="n">logger</span>
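The added `if TYPE_CHECKING:` guard imports `PyTorchConfig` for annotations only, keeping the torch-backend module out of the runtime import graph (useful against import cycles and heavyweight dependencies). The general pattern, sketched with a hypothetical `heavy_module`:

```python
# The TYPE_CHECKING pattern: type checkers see the import, the runtime
# never executes it. 'heavy_module' is a hypothetical module name.
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from heavy_module import HeavyConfig  # never imported at runtime


def configure(cfg: HeavyConfig) -> str:
    # With postponed evaluation, the annotation stays a plain string at
    # runtime, so heavy_module is only required by static type checkers.
    return type(cfg).__name__


print(configure.__annotations__["cfg"])  # 'HeavyConfig'
```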
@ -709,7 +723,8 @@
<span class="s2">"MTP"</span><span class="p">:</span> <span class="n">MTPDecodingConfig</span><span class="p">,</span>
<span class="s2">"Medusa"</span><span class="p">:</span> <span class="n">MedusaDecodingConfig</span><span class="p">,</span>
<span class="s2">"Eagle"</span><span class="p">:</span> <span class="n">EagleDecodingConfig</span><span class="p">,</span>
<span class="s2">"Lookahead"</span><span class="p">:</span> <span class="n">LookaheadDecodingConfig</span>
<span class="s2">"Lookahead"</span><span class="p">:</span> <span class="n">LookaheadDecodingConfig</span><span class="p">,</span>
<span class="s2">"NGram"</span><span class="p">:</span> <span class="n">NGramDecodingConfig</span><span class="p">,</span>
<span class="p">}</span>

<span class="n">config_class</span> <span class="o">=</span> <span class="n">config_classes</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">decoding_type</span><span class="p">)</span>
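Registering `"NGram"` extends the string-keyed mapping that turns a `decoding_type` value into the matching config class. A compact sketch of that registry dispatch, with toy config classes in place of the real ones:

```python
# Sketch of the decoding_type -> config-class registry, with toy config
# classes standing in for the real decoding configs.
from dataclasses import dataclass


@dataclass
class LookaheadSketch:
    max_window_size: int = 4


@dataclass
class NGramSketch:
    prompt_lookup_num_tokens: int = 2


CONFIG_CLASSES = {
    "Lookahead": LookaheadSketch,
    "NGram": NGramSketch,
}


def config_from_dict(data: dict):
    decoding_type = data.pop("decoding_type")
    config_class = CONFIG_CLASSES.get(decoding_type)
    if config_class is None:
        raise ValueError(f"Unknown decoding_type: {decoding_type!r}")
    return config_class(**data)


print(config_from_dict({"decoding_type": "NGram",
                        "prompt_lookup_num_tokens": 3}))
```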
@ -750,6 +765,7 @@
<span class="n">num_eagle_layers</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">max_non_leaves_per_layer</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">pytorch_eagle_weights_path</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">eagle3_one_model</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">True</span>

<div class="viewcode-block" id="EagleDecodingConfig.from_dict">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.EagleDecodingConfig.from_dict">[docs]</a>
@ -762,6 +778,46 @@



<div class="viewcode-block" id="NGramDecodingConfig">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.NGramDecodingConfig">[docs]</a>
<span class="k">class</span><span class="w"> </span><span class="nc">NGramDecodingConfig</span><span class="p">(</span><span class="n">DecodingBaseConfig</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Configuration for NGram drafter speculative decoding.</span>

<span class="sd"> Arguments:</span>
<span class="sd"> prompt_lookup_num_tokens: int</span>
<span class="sd"> The maximum length of draft tokens (the maximum number of output draft tokens).</span>

<span class="sd"> max_matching_ngram_size: int</span>
<span class="sd"> The maximum length of matching n-grams (the maximum number of input tokens to search).</span>

<span class="sd"> is_keep_all: bool = True</span>
<span class="sd"> Whether to keep all candidate pattern-match pairs; if False, only one match is kept for each pattern.</span>

<span class="sd"> is_use_oldest: bool = True</span>
<span class="sd"> Whether to provide the oldest match when a pattern is hit; if False, the newest one is provided.</span>

<span class="sd"> is_public_pool: bool = True</span>
<span class="sd"> Whether to use a common pool for all requests; if False, the pool is private to each request.</span>
<span class="sd"> """</span>

<span class="n">prompt_lookup_num_tokens</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span>
<span class="n">max_matching_ngram_size</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">4</span>
<span class="n">is_keep_all</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>
<span class="n">is_use_oldest</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>
<span class="n">is_public_pool</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>

<div class="viewcode-block" id="NGramDecodingConfig.from_dict">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.NGramDecodingConfig.from_dict">[docs]</a>
<span class="nd">@classmethod</span>
<span class="k">def</span><span class="w"> </span><span class="nf">from_dict</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">data</span><span class="p">:</span> <span class="nb">dict</span><span class="p">):</span>
<span class="k">return</span> <span class="bp">cls</span><span class="p">(</span><span class="o">**</span><span class="n">data</span><span class="p">)</span></div>


<span class="n">decoding_type</span><span class="p">:</span> <span class="n">ClassVar</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"NGram"</span></div>



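To ground the fields documented above: n-gram (prompt-lookup) drafting proposes draft tokens by matching the tail of the generated sequence against earlier tokens and copying what followed the match. A toy illustration of that lookup using the two length knobs; the real drafter is considerably more elaborate (pooling, match retention policies, and so on):

```python
# Toy prompt-lookup drafter illustrating the two length knobs from
# NGramDecodingConfig; the production drafter is far more elaborate.
from typing import List


def ngram_draft(tokens: List[int],
                max_matching_ngram_size: int = 4,
                prompt_lookup_num_tokens: int = 2) -> List[int]:
    # Try the longest tail n-gram first, shrinking toward a single token.
    for size in range(min(max_matching_ngram_size, len(tokens) - 1), 0, -1):
        suffix = tokens[-size:]
        # Scan earlier positions (newest first) for the same n-gram.
        for start in range(len(tokens) - size - 1, -1, -1):
            if tokens[start:start + size] == suffix:
                follow = tokens[start + size:
                                start + size + prompt_lookup_num_tokens]
                if follow:
                    return follow  # draft = tokens that followed the match
    return []


history = [5, 6, 7, 8, 9, 5, 6, 7]
print(ngram_draft(history))  # [8, 9]: last time (5, 6, 7) appeared, 8, 9 followed
```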
<div class="viewcode-block" id="MTPDecodingConfig">
|
||||
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.MTPDecodingConfig">[docs]</a>
|
||||
<span class="k">class</span><span class="w"> </span><span class="nc">MTPDecodingConfig</span><span class="p">(</span><span class="n">DecodingBaseConfig</span><span class="p">):</span>
|
||||
@ -1063,7 +1119,9 @@
|
||||
|
||||
<div class="viewcode-block" id="LookaheadDecodingConfig.validate_positive_values">
|
||||
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.LookaheadDecodingConfig.validate_positive_values">[docs]</a>
|
||||
<span class="nd">@validator</span><span class="p">(</span><span class="s1">'max_window_size'</span><span class="p">,</span> <span class="s1">'max_ngram_size'</span><span class="p">,</span> <span class="s1">'max_verification_set_size'</span><span class="p">)</span>
<span class="nd">@field_validator</span><span class="p">(</span><span class="s1">'max_window_size'</span><span class="p">,</span> <span class="s1">'max_ngram_size'</span><span class="p">,</span>
<span class="s1">'max_verification_set_size'</span><span class="p">)</span>
<span class="nd">@classmethod</span>
<span class="k">def</span><span class="w"> </span><span class="nf">validate_positive_values</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">v</span><span class="p">):</span>
<span class="k">if</span> <span class="n">v</span> <span class="o"><=</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Value must be positive, got </span><span class="si">{</span><span class="n">v</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span>
@@ -1270,7 +1328,10 @@
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">model</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">model</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="kc">None</span>
<span class="k">class</span><span class="w"> </span><span class="nc">LlmArgs</span><span class="p">(</span><span class="n">BaseModel</span><span class="p">):</span>
<span class="k">class</span><span class="w"> </span><span class="nc">BaseLlmArgs</span><span class="p">(</span><span class="n">BaseModel</span><span class="p">):</span>
<span class="w"> </span><span class="sd">"""</span>
<span class="sd"> Base class for both TorchLlmArgs and TrtLlmArgs. It contains all the arguments that are common to both.</span>
<span class="sd"> """</span>
<span class="n">model_config</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">"arbitrary_types_allowed"</span><span class="p">:</span> <span class="kc">True</span><span class="p">,</span>
<span class="s2">"extra"</span><span class="p">:</span> <span class="s2">"allow"</span><span class="p">,</span>
@@ -1342,20 +1403,11 @@
<span class="n">cp_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">dict</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default_factory</span><span class="o">=</span><span class="nb">dict</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"Context parallel config."</span><span class="p">)</span>
<span class="n">auto_parallel</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"Enable auto parallel mode."</span><span class="p">)</span>
<span class="n">auto_parallel_world_size</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">"The world size for auto parallel mode."</span><span class="p">)</span>
<span class="n">load_format</span><span class="p">:</span> <span class="n">Literal</span><span class="p">[</span><span class="s1">'auto'</span><span class="p">,</span> <span class="s1">'dummy'</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="s1">'auto'</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"The format to load the model."</span><span class="p">,</span>
<span class="n">json_schema_extra</span><span class="o">=</span><span class="p">{</span><span class="s2">"type"</span><span class="p">:</span> <span class="s2">"Literal['auto', 'dummy']"</span><span class="p">})</span>
<span class="n">enable_tqdm</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"Enable tqdm for progress bar."</span><span class="p">)</span>
<span class="c1"># LoRA arguments</span>
<span class="n">enable_lora</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">"Enable LoRA."</span><span class="p">)</span>
@@ -1387,18 +1439,9 @@
<span class="n">quant_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">QuantConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">"Quantization config."</span><span class="p">)</span>
<span class="n">calib_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">CalibConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">"Calibration config."</span><span class="p">)</span>
<span class="c1"># BuildConfig is introduced to give users a familiar interface to configure the model building.</span>
<span class="n">build_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">object</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"Build config."</span><span class="p">,</span>
<span class="n">json_schema_extra</span><span class="o">=</span><span class="p">{</span><span class="s2">"type"</span><span class="p">:</span> <span class="sa">f</span><span class="s2">"Optional[</span><span class="si">{</span><span class="n">get_type_repr</span><span class="p">(</span><span class="n">BuildConfig</span><span class="p">)</span><span class="si">}</span><span class="s2">]"</span><span class="p">})</span>
<span class="c1"># Several options from ExecutorConfig, expanded here for less hierarchy</span>
<span class="n">kv_cache_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">KvCacheConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">"KV cache config."</span><span class="p">)</span>
<span class="n">kv_cache_config</span><span class="p">:</span> <span class="n">KvCacheConfig</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default_factory</span><span class="o">=</span><span class="n">KvCacheConfig</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"KV cache config."</span><span class="p">)</span>
<span class="n">enable_chunked_prefill</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"Enable chunked prefill."</span><span class="p">)</span>
@@ -1421,29 +1464,12 @@
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"The maximum number of iterations for request stats."</span><span class="p">)</span>
<span class="n">workspace</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"The workspace for the model."</span><span class="p">)</span>
<span class="c1"># A handful of options from PretrainedConfig</span>
<span class="n">embedding_parallel_mode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="s1">'SHARDING_ALONG_VOCAB'</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"The embedding parallel mode."</span><span class="p">)</span>
<span class="n">fast_build</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">"Enable fast build."</span><span class="p">)</span>
<span class="c1"># Once set, the model will reuse the build_cache</span>
<span class="n">enable_build_cache</span><span class="p">:</span> <span class="nb">object</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"Enable build cache."</span><span class="p">,</span>
<span class="n">json_schema_extra</span><span class="o">=</span><span class="p">{</span>
<span class="s2">"type"</span><span class="p">:</span> <span class="sa">f</span><span class="s2">"Union[</span><span class="si">{</span><span class="n">get_type_repr</span><span class="p">(</span><span class="n">BuildCacheConfig</span><span class="p">)</span><span class="si">}</span><span class="s2">, bool]"</span>
<span class="p">})</span>
<span class="n">peft_cache_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">PeftCacheConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">"PEFT cache config."</span><span class="p">)</span>
<span class="n">scheduler_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">SchedulerConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">"Scheduler config."</span><span class="p">)</span>
<span class="n">scheduler_config</span><span class="p">:</span> <span class="n">SchedulerConfig</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default_factory</span><span class="o">=</span><span class="n">SchedulerConfig</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"Scheduler config."</span><span class="p">)</span>
<span class="n">cache_transceiver_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">CacheTransceiverConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">"Cache transceiver config."</span><span class="p">)</span>
@@ -1451,8 +1477,8 @@
<span class="c1"># Speculative decoding parameters</span>
<span class="n">speculative_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span>
<span class="n">LookaheadDecodingConfig</span><span class="p">,</span> <span class="n">MedusaDecodingConfig</span><span class="p">,</span> <span class="n">EagleDecodingConfig</span><span class="p">,</span>
<span class="n">MTPDecodingConfig</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"Speculative decoding config."</span><span class="p">)</span>
<span class="n">MTPDecodingConfig</span><span class="p">,</span> <span class="n">NGramDecodingConfig</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">"Speculative decoding config."</span><span class="p">)</span>
<span class="n">batching_type</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">BatchingType</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"Batching type."</span><span class="p">)</span>
@@ -1460,13 +1486,6 @@
<span class="n">normalize_log_probs</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">"Normalize log probabilities."</span><span class="p">)</span>
<span class="n">gather_generation_logits</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">"Gather generation logits."</span><span class="p">)</span>
<span class="n">extended_runtime_perf_knob_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span>
<span class="n">ExtendedRuntimePerfKnobConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">"Extended runtime perf knob config."</span><span class="p">)</span>
<span class="n">max_batch_size</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"The maximum batch size."</span><span class="p">)</span>
@@ -1487,6 +1506,9 @@
<span class="n">description</span><span class="o">=</span><span class="s2">"The backend to use."</span><span class="p">,</span>
<span class="n">exclude</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">gather_generation_logits</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">"Gather generation logits."</span><span class="p">)</span>
<span class="c1"># private fields those are unstable and just for internal use</span>
|
||||
<span class="n">num_postprocess_workers</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span>
@@ -1559,40 +1581,19 @@
<span class="n">moe_tp_size</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_tensor_parallel_size</span><span class="p">,</span>
<span class="n">moe_ep_size</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_expert_parallel_size</span><span class="p">,</span>
<span class="n">enable_attention_dp</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_attention_dp</span><span class="p">,</span>
<span class="n">cp_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">cp_config</span><span class="p">,</span>
<span class="n">auto_parallel</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">auto_parallel</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">auto_parallel</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">world_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">auto_parallel_world_size</span>
<span class="bp">self</span><span class="o">.</span><span class="n">auto_parallel_config</span> <span class="o">=</span> <span class="n">AutoParallelConfig</span><span class="p">(</span>
<span class="n">sharded_io_allowlist</span><span class="o">=</span><span class="p">[</span>
<span class="s2">"past_key_value_</span><span class="se">\\</span><span class="s2">d+"</span><span class="p">,</span>
<span class="s2">"present_key_value_</span><span class="se">\\</span><span class="s2">d*"</span><span class="p">,</span>
<span class="p">],</span>
<span class="n">same_buffer_io</span><span class="o">=</span><span class="p">{</span>
<span class="s2">"past_key_value_(</span><span class="se">\\</span><span class="s2">d+)"</span><span class="p">:</span> <span class="s2">"present_key_value_</span><span class="se">\\</span><span class="s2">1"</span><span class="p">,</span>
<span class="p">},</span>
<span class="o">**</span><span class="n">infer_cluster_config</span><span class="p">(),</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span> <span class="ow">or</span> <span class="n">KvCacheConfig</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">scheduler_config</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">scheduler_config</span> <span class="ow">or</span> <span class="n">SchedulerConfig</span><span class="p">()</span>
<span class="c1"># This is used to hold th options for convert_checkpoint</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_convert_checkpoint_options</span> <span class="o">=</span> <span class="p">{}</span>
<span class="n">cp_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">cp_config</span><span class="p">)</span>
<span class="nd">@classmethod</span>
<span class="k">def</span><span class="w"> </span><span class="nf">from_kwargs</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"LlmArgs"</span><span class="p">:</span>
<span class="k">def</span><span class="w"> </span><span class="nf">from_kwargs</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"BaseLlmArgs"</span><span class="p">:</span>
<span class="w"> </span><span class="sd">"""Create `LlmArgs` instance from kwargs.</span>
<span class="sd"> Args:</span>
<span class="sd"> kwargs (Any): Arguments passed to `LlmArgs` constructor.</span>
<span class="sd"> Returns:</span>
<span class="sd"> tensorrt_llm.llmapi.llm_utils.LlmArgs: The `LlmArgs` instance.</span>
<span class="sd"> tensorrt_llm.llmapi.llm_utils.BaseLlmArgs: The `BaseLlmArgs` instance.</span>
<span class="sd"> """</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="n">LlmArgs</span><span class="o">.</span><span class="n">_maybe_update_config_for_consistency</span><span class="p">(</span><span class="nb">dict</span><span class="p">(</span><span class="n">kwargs</span><span class="p">))</span>
<span class="n">kwargs</span> <span class="o">=</span> <span class="n">BaseLlmArgs</span><span class="o">.</span><span class="n">_maybe_update_config_for_consistency</span><span class="p">(</span><span class="nb">dict</span><span class="p">(</span><span class="n">kwargs</span><span class="p">))</span>
<span class="n">ret</span> <span class="o">=</span> <span class="bp">cls</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="n">ret</span><span class="o">.</span><span class="n">_setup</span><span class="p">()</span>
<span class="k">return</span> <span class="n">ret</span>
@@ -1603,8 +1604,7 @@
<span class="sd"> Returns:</span>
<span class="sd"> dict: The dict that contains all fields of the `LlmArgs` instance.</span>
<span class="sd"> """</span>
<span class="k">return</span> <span class="nb">dict</span><span class="p">(</span>
<span class="p">(</span><span class="n">field</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">field</span><span class="o">.</span><span class="n">name</span><span class="p">))</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">fields</span><span class="p">(</span><span class="bp">self</span><span class="p">))</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">model_dump</span><span class="p">()</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_maybe_update_config_for_consistency</span><span class="p">(</span>
@@ -1612,18 +1612,18 @@
<span class="c1"># max_beam_width is not included since vague behavior due to lacking the support for dynamic beam width during</span>
<span class="c1"># generation</span>
<span class="n">black_list</span> <span class="o">=</span> <span class="nb">set</span><span class="p">([</span><span class="s2">"max_beam_width"</span><span class="p">])</span>
<span class="n">executor_config_attrs</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">attr</span> <span class="k">for</span> <span class="n">attr</span> <span class="ow">in</span> <span class="nb">dir</span><span class="p">(</span><span class="n">ExecutorConfig</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">attr</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s1">'_'</span><span class="p">)</span>
<span class="ow">and</span> <span class="nb">callable</span><span class="p">(</span><span class="nb">getattr</span><span class="p">(</span><span class="n">ExecutorConfig</span><span class="p">,</span> <span class="n">attr</span><span class="p">)))</span>
<span class="n">executor_config_attrs</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span>
<span class="n">attr</span> <span class="k">for</span> <span class="n">attr</span> <span class="ow">in</span> <span class="nb">dir</span><span class="p">(</span><span class="n">_ExecutorConfig</span><span class="p">)</span> <span class="k">if</span> <span class="ow">not</span> <span class="n">attr</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s1">'_'</span><span class="p">)</span>
<span class="ow">and</span> <span class="nb">callable</span><span class="p">(</span><span class="nb">getattr</span><span class="p">(</span><span class="n">_ExecutorConfig</span><span class="p">,</span> <span class="n">attr</span><span class="p">)))</span>
<span class="n">executor_config_attrs</span> <span class="o">-=</span> <span class="n">black_list</span>
<span class="n">llm_args_attr</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">LlmArgs</span><span class="o">.</span><span class="n">model_fields</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
<span class="c1"># NOTE: When cpp ExecutorConfig add new options, please add the new options into `_LlmArgs` with docs as well</span>
<span class="n">llm_args_attr</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">BaseLlmArgs</span><span class="o">.</span><span class="n">model_fields</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
<span class="c1"># NOTE: When cpp ExecutorConfig add new options, please add the new options into `LlmArgs` with docs as well</span>
|
||||
<span class="c1"># ASK chunweiy for help if you are not sure about the new options.</span>
<span class="k">assert</span> <span class="n">executor_config_attrs</span><span class="o">.</span><span class="n">issubset</span><span class="p">(</span>
<span class="n">llm_args_attr</span>
<span class="p">),</span> <span class="sa">f</span><span class="s2">"New options found in underlying ExecutorConfig: </span><span class="si">{</span><span class="n">llm_args_attr</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">executor_config_attrs</span><span class="si">}</span><span class="s2">"</span>
<span class="c1"># ensure build_config and LlmArgs consistency</span>
<span class="c1"># ensure build_config and LlmArgsBase consistency</span>
<span class="k">if</span> <span class="n">kwargs_dict</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"backend"</span><span class="p">)</span> <span class="o">!=</span> <span class="s2">"pytorch"</span> <span class="ow">and</span> <span class="n">kwargs_dict</span><span class="o">.</span><span class="n">get</span><span class="p">(</span>
<span class="s2">"build_config"</span><span class="p">):</span>
<span class="c1"># TODO: move this to _perform_config_arbitration() once it's default-on.</span>
@@ -1633,11 +1633,11 @@
<span class="n">build_val</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">kwargs_dict</span><span class="p">[</span><span class="s2">"build_config"</span><span class="p">],</span> <span class="n">field_name</span><span class="p">,</span>
<span class="kc">None</span><span class="p">)</span>
<span class="n">llmargs_val</span> <span class="o">=</span> <span class="n">kwargs_dict</span><span class="o">.</span><span class="n">get</span><span class="p">(</span>
<span class="n">field_name</span><span class="p">)</span> <span class="ow">or</span> <span class="n">LlmArgs</span><span class="o">.</span><span class="n">model_fields</span><span class="p">[</span><span class="n">field_name</span><span class="p">]</span>
<span class="n">field_name</span><span class="p">)</span> <span class="ow">or</span> <span class="n">BaseLlmArgs</span><span class="o">.</span><span class="n">model_fields</span><span class="p">[</span><span class="n">field_name</span><span class="p">]</span>
<span class="k">if</span> <span class="n">build_val</span> <span class="o">!=</span> <span class="n">llmargs_val</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">"Overriding LlmArgs.</span><span class="si">{</span><span class="n">field_name</span><span class="si">}</span><span class="s2"> (</span><span class="si">{</span><span class="n">llmargs_val</span><span class="si">}</span><span class="s2">) with build_config.</span><span class="si">{</span><span class="n">field_name</span><span class="si">}</span><span class="s2"> (</span><span class="si">{</span><span class="n">build_val</span><span class="si">}</span><span class="s2">)."</span>
<span class="sa">f</span><span class="s2">"Overriding LlmArgsBase.</span><span class="si">{</span><span class="n">field_name</span><span class="si">}</span><span class="s2"> (</span><span class="si">{</span><span class="n">llmargs_val</span><span class="si">}</span><span class="s2">) with build_config.</span><span class="si">{</span><span class="n">field_name</span><span class="si">}</span><span class="s2"> (</span><span class="si">{</span><span class="n">build_val</span><span class="si">}</span><span class="s2">)."</span>
<span class="p">)</span>
<span class="n">kwargs_dict</span><span class="p">[</span><span class="n">field_name</span><span class="p">]</span> <span class="o">=</span> <span class="n">build_val</span>
@@ -1646,12 +1646,15 @@
<span class="k">def</span><span class="w"> </span><span class="nf">_setup</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="w"> </span><span class="sd">''' This method will setup the configs right before building the model. '''</span>
|
||||
|
||||
<span class="n">is_trt_llm_args</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">TrtLlmArgs</span><span class="p">)</span>
<span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">model</span><span class="p">,</span>
<span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Path</span><span class="p">)),</span> <span class="sa">f</span><span class="s2">"Invalid model: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">model</span><span class="si">}</span><span class="s2">"</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setup_embedding_parallel_mode</span><span class="p">()</span>
<span class="k">if</span> <span class="n">is_trt_llm_args</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setup_embedding_parallel_mode</span><span class="p">()</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">enable_build_cache</span><span class="p">:</span>
<span class="k">if</span> <span class="n">is_trt_llm_args</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">enable_build_cache</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">enable_build_cache</span> <span class="o">=</span> <span class="n">BuildCacheConfig</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">enable_build_cache</span><span class="p">,</span> <span class="nb">bool</span><span class="p">)</span> <span class="k">else</span> <span class="bp">self</span><span class="o">.</span><span class="n">enable_build_cache</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_build_cache</span><span class="p">,</span> <span class="n">BuildCacheConfig</span><span class="p">):</span>
@@ -1692,7 +1695,8 @@
<span class="bp">self</span><span class="o">.</span><span class="n">quant_config</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">quant_config</span> <span class="ow">or</span> <span class="n">QuantConfig</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">calib_config</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">calib_config</span> <span class="ow">or</span> <span class="n">CalibConfig</span><span class="p">()</span>
<span class="k">if</span> <span class="n">is_trt_llm_args</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">calib_config</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">calib_config</span> <span class="ow">or</span> <span class="n">CalibConfig</span><span class="p">()</span>
<span class="c1"># Note: max_batch_size and max_num_tokens in LlmArgs are for runtime,</span>
<span class="c1"># which will be passed to the C++ Executor API, overwriting the values</span>
@@ -1719,8 +1723,9 @@
<span class="bp">self</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_num_tokens</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_num_tokens</span>
<span class="c1"># TODO: remove the checker when manage weights support all data types</span>
|
||||
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">fast_build</span> <span class="ow">and</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">quant_config</span><span class="o">.</span><span class="n">quant_algo</span> <span class="ow">is</span> <span class="n">QuantAlgo</span><span class="o">.</span><span class="n">FP8</span>
<span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">quant_config</span><span class="o">.</span><span class="n">quant_algo</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">):</span>
<span class="k">if</span> <span class="n">is_trt_llm_args</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">fast_build</span> <span class="ow">and</span> <span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">quant_config</span><span class="o">.</span><span class="n">quant_algo</span> <span class="ow">is</span> <span class="n">QuantAlgo</span><span class="o">.</span><span class="n">FP8</span>
<span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">quant_config</span><span class="o">.</span><span class="n">quant_algo</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_update_plugin_config</span><span class="p">(</span><span class="s2">"manage_weights"</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">_world_size</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
@@ -1733,9 +1738,12 @@
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_lora_rank</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">lora_config</span><span class="o">.</span><span class="n">max_lora_rank</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_lora_rank</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_setup_speculative_config</span><span class="p">()</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">enable_prompt_adapter</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_prompt_embedding_table_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_prompt_adapter_token</span> <span class="o">*</span> <span class="bp">self</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_batch_size</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_setup_speculative_config</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="p">,</span> <span class="n">LookaheadDecodingConfig</span><span class="p">):</span>
<span class="n">lookahead_config</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span>
@@ -1765,7 +1773,7 @@
<span class="bp">self</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_draft_len</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">max_draft_len</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">backend</span> <span class="o">!=</span> <span class="s1">'pytorch'</span><span class="p">:</span>
<span class="n">eagle_config</span> <span class="o">=</span> <span class="n">EagleConfig</span><span class="p">(</span>
<span class="n">eagle_config</span> <span class="o">=</span> <span class="n">_EagleConfig</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">eagle_choices</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">greedy_sampling</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">posterior_threshold</span><span class="p">,</span>
@@ -1778,9 +1786,25 @@
<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm._torch.speculative</span><span class="w"> </span><span class="kn">import</span> <span class="n">Eagle3Config</span>
<span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span> <span class="o">=</span> <span class="n">Eagle3Config</span><span class="p">(</span>
<span class="n">max_draft_tokens</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">max_draft_len</span><span class="p">,</span>
<span class="n">eagle_weights_path</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span>
<span class="n">pytorch_eagle_weights_path</span><span class="p">)</span>
<span class="n">draft_model_path</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span>
<span class="n">pytorch_eagle_weights_path</span><span class="p">,</span>
<span class="n">eagle3_one_model</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span>
<span class="n">eagle3_one_model</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="p">,</span> <span class="n">NGramDecodingConfig</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">speculative_decoding_mode</span> <span class="o">=</span> <span class="n">SpeculativeDecodingMode</span><span class="o">.</span><span class="n">NGRAM</span>
<span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">backend</span> <span class="o">==</span> <span class="s1">'pytorch'</span>
<span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">prompt_lookup_num_tokens</span> <span class="o">></span> <span class="mi">0</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">max_matching_ngram_size</span> <span class="o">></span> <span class="mi">0</span>
<span class="bp">self</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_draft_len</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">max_draft_len</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm._torch.speculative</span><span class="w"> </span><span class="kn">import</span> <span class="n">NGramConfig</span>
<span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span> <span class="o">=</span> <span class="n">NGramConfig</span><span class="p">(</span>
<span class="n">prompt_lookup_num_tokens</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span>
<span class="n">prompt_lookup_num_tokens</span><span class="p">,</span>
<span class="n">max_matching_ngram_size</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span>
<span class="n">max_matching_ngram_size</span><span class="p">,</span>
<span class="n">is_keep_all</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">is_keep_all</span><span class="p">,</span>
<span class="n">is_use_oldest</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">is_use_oldest</span><span class="p">,</span>
<span class="n">is_public_pool</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">is_public_pool</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="p">,</span> <span class="n">MTPDecodingConfig</span><span class="p">):</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm._torch.speculative</span><span class="w"> </span><span class="kn">import</span> <span class="n">MTPConfig</span>
<span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span> <span class="o">=</span> <span class="n">MTPConfig</span><span class="p">(</span>
@@ -1921,32 +1945,409 @@
<span class="sa">f</span><span class="s2">"Invalid embedding_parallel_mode: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">llm_args</span><span class="o">.</span><span class="n">embedding_parallel_mode</span><span class="si">}</span><span class="s2">"</span>
<span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_validate_kv_cache_config</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"KvCacheConfig is required for streaming LLM."</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span><span class="o">.</span><span class="n">max_attention_window</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">"KvCacheConfig.max_attention_window should be set for streaming LLM."</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="n">i</span> <span class="o"><=</span> <span class="mi">0</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span><span class="o">.</span><span class="n">max_attention_window</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">"Elements in KvCacheConfig.max_attention_window should be greater than 0."</span>
<span class="p">)</span>
<div class="viewcode-block" id="TrtLlmArgs">
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs">[docs]</a>
<span class="k">class</span><span class="w"> </span><span class="nc">TrtLlmArgs</span><span class="p">(</span><span class="n">BaseLlmArgs</span><span class="p">):</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span><span class="o">.</span><span class="n">sink_token_length</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">"KvCacheConfig.sink_token_length should be set for streaming LLM."</span>
<span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span><span class="o">.</span><span class="n">sink_token_length</span> <span class="o"><=</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">"KvCacheConfig.sink_token_length should be greater than 0."</span><span class="p">)</span>
<span class="n">auto_parallel</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"Enable auto parallel mode."</span><span class="p">,</span>
<span class="n">deprecated</span><span class="o">=</span>
<span class="s2">"Use tensor_parallel_size/pipeline_parallel_size/xxx_parallel_size instead."</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">auto_parallel_world_size</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"The world size for auto parallel mode."</span><span class="p">,</span>
<span class="n">deprecated</span><span class="o">=</span>
<span class="s2">"Use tensor_parallel_size/pipeline_parallel_size/xxx_parallel_size instead."</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">enable_tqdm</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"Enable tqdm for progress bar."</span><span class="p">)</span>
<span class="c1"># BuildConfig is introduced to give users a familiar interface to configure the model building.</span>
<span class="n">build_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">object</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"Build config."</span><span class="p">,</span>
<span class="n">json_schema_extra</span><span class="o">=</span><span class="p">{</span><span class="s2">"type"</span><span class="p">:</span> <span class="sa">f</span><span class="s2">"Optional[</span><span class="si">{</span><span class="n">get_type_repr</span><span class="p">(</span><span class="n">BuildConfig</span><span class="p">)</span><span class="si">}</span><span class="s2">]"</span><span class="p">})</span>
<span class="n">workspace</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"The workspace for the model."</span><span class="p">)</span>
<span class="c1"># Once set, the model will reuse the build_cache</span>
<span class="n">enable_build_cache</span><span class="p">:</span> <span class="nb">object</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">description</span><span class="o">=</span><span class="s2">"Enable build cache."</span><span class="p">,</span>
<span class="n">json_schema_extra</span><span class="o">=</span><span class="p">{</span>
<span class="s2">"type"</span><span class="p">:</span> <span class="sa">f</span><span class="s2">"Union[</span><span class="si">{</span><span class="n">get_type_repr</span><span class="p">(</span><span class="n">BuildCacheConfig</span><span class="p">)</span><span class="si">}</span><span class="s2">, bool]"</span>
<span class="p">})</span>
<span class="n">extended_runtime_perf_knob_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span>
<span class="n">ExtendedRuntimePerfKnobConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">"Extended runtime perf knob config."</span><span class="p">)</span>
<span class="n">calib_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">CalibConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
|
||||
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">"Calibration config."</span><span class="p">)</span>
|
||||
|
||||
<span class="n">embedding_parallel_mode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
|
||||
<span class="n">default</span><span class="o">=</span><span class="s1">'SHARDING_ALONG_VOCAB'</span><span class="p">,</span>
|
||||
<span class="n">description</span><span class="o">=</span><span class="s2">"The embedding parallel mode."</span><span class="p">)</span>
|
||||
|
||||
<span class="n">fast_build</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">"Enable fast build."</span><span class="p">)</span>
|
||||
|
||||
<span class="c1"># Private attributes</span>
|
||||
<span class="n">_auto_parallel_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">AutoParallelConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">PrivateAttr</span><span class="p">(</span>
|
||||
<span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
|
||||
<span class="c1"># This is used to hold the options for convert_checkpoint</span>
|
||||
<span class="n">_convert_checkpoint_options</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span>
|
||||
<span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="n">PrivateAttr</span><span class="p">(</span><span class="n">default_factory</span><span class="o">=</span><span class="nb">dict</span><span class="p">)</span>
|
||||
|
||||
<span class="nd">@property</span>
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">auto_parallel_config</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">AutoParallelConfig</span><span class="p">:</span>
|
||||
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_auto_parallel_config</span>
|
||||
|
||||
<div class="viewcode-block" id="TrtLlmArgs.model_post_init">
|
||||
<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.model_post_init">[docs]</a>
|
||||
<span class="nd">@print_traceback_on_error</span>
|
||||
<span class="k">def</span><span class="w"> </span><span class="nf">model_post_init</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">__context</span><span class="p">):</span>
|
||||
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">model_post_init</span><span class="p">(</span><span class="n">__context</span><span class="p">)</span>
|
||||
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">_auto_parallel_config</span> <span class="o">=</span> <span class="n">AutoParallelConfig</span><span class="p">(</span>
|
||||
<span class="n">sharded_io_allowlist</span><span class="o">=</span><span class="p">[</span>
|
||||
<span class="s2">"past_key_value_</span><span class="se">\\</span><span class="s2">d+"</span><span class="p">,</span>
|
||||
<span class="s2">"present_key_value_</span><span class="se">\\</span><span class="s2">d*"</span><span class="p">,</span>
|
||||
<span class="p">],</span>
|
||||
<span class="n">same_buffer_io</span><span class="o">=</span><span class="p">{</span>
|
||||
<span class="s2">"past_key_value_(</span><span class="se">\\</span><span class="s2">d+)"</span><span class="p">:</span> <span class="s2">"present_key_value_</span><span class="se">\\</span><span class="s2">1"</span><span class="p">,</span>
|
||||
<span class="p">},</span>
|
||||
<span class="o">**</span><span class="n">infer_cluster_config</span><span class="p">(),</span>
|
||||
<span class="p">)</span>
|
||||
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">auto_parallel</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">auto_parallel</span>
|
||||
|
||||
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">auto_parallel</span><span class="p">:</span>
|
||||
<span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">world_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">auto_parallel_world_size</span></div>
|
||||
</div>
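The `sharded_io_allowlist` and `same_buffer_io` entries above are plain regex patterns; a minimal standalone sketch of how the past/present KV-cache buffer pairing resolves, using only the patterns shown above:

import re

# same_buffer_io pairs each past-KV input tensor with its present-KV
# output via a regex backreference; e.g. for layer 3:
pattern, repl = "past_key_value_(\\d+)", "present_key_value_\\1"
print(re.sub(pattern, repl, "past_key_value_3"))  # -> present_key_value_3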

LlmArgs = TrtLlmArgs

LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(LlmArgs,
                                                            indent=' ' * 4)


class LoadFormat(Enum):
    AUTO = 0
    # Initialize all weights randomly.
    DUMMY = 1

class TorchLlmArgs(BaseLlmArgs):

    # Just a dummy BuildConfig to allow code reuse with the TrtLlmArgs
    build_config: Optional[object] = Field(
        default=None,
        description="Build config.",
        exclude_from_json=True,
        json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"})

    # PyTorch backend specific configurations

    use_cuda_graph: bool = Field(
        default=False,
        description=
        "If true, use CUDA graphs for decoding. CUDA graphs are only created for the batch sizes in cuda_graph_batch_sizes, and are enabled for batches that consist of decoding requests *only* (the reason is that it's hard to capture a single graph with prefill requests since the input shapes are a function of the sequence lengths). Note that each CUDA graph can use up to 200 MB of extra memory."
    )

    cuda_graph_batch_sizes: Optional[List[int]] = Field(
        default=None,
        description="List of batch sizes to create CUDA graphs for.")

    cuda_graph_max_batch_size: int = Field(
        default=0, description="Maximum batch size for CUDA graphs.")

    cuda_graph_padding_enabled: bool = Field(
        default=False,
        description=
        "If true, batches are rounded up to the nearest cuda_graph_batch_size. This is usually a net win for performance."
    )
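Taken together, these four CUDA-graph fields are usually set at construction time. A minimal sketch, assuming `TorchLlmArgs` is importable from `tensorrt_llm.llmapi` (as the reference links here imply) and that the required base fields such as `model` are supplied; the checkpoint path is hypothetical:

from tensorrt_llm.llmapi import TorchLlmArgs

args = TorchLlmArgs(
    model="/models/llama-3-8b",     # hypothetical checkpoint path
    use_cuda_graph=True,
    cuda_graph_padding_enabled=True,
    cuda_graph_max_batch_size=64,
)
# validate_cuda_graph_config (further below) fills cuda_graph_batch_sizes
# from cuda_graph_max_batch_size when the list is not given explicitly.
print(args.cuda_graph_batch_sizes)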

    disable_overlap_scheduler: bool = Field(
        default=False, description="Disable the overlap scheduler.")

    moe_max_num_tokens: Optional[int] = Field(
        default=None,
        description=
        "If set, at most moe_max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. If the number of tokens exceeds moe_max_num_tokens, the input tensors will be split into chunks and a for loop will be used."
    )

    moe_load_balancer: Optional[Union[object, str]] = Field(
        default=None,
        description="Configuration for MoE load balancing.",
        json_schema_extra={"type": "Union[MoeLoadBalancerConfig, str]"})

    attn_backend: str = Field(default='TRTLLM',
                              description="Attention backend to use.")

    moe_backend: str = Field(default='CUTLASS',
                             description="MoE backend to use.")

    mixed_sampler: bool = Field(
        default=False,
        description=
        "If true, will iterate over sampling_params of each request and use the corresponding sampling strategy, e.g. top-k, top-p, etc."
    )

    enable_trtllm_sampler: bool = Field(
        default=False,
        description=
        "If true, will use the TRTLLM sampler instead of the PyTorch sampler. The TRTLLM sampler has a wide coverage of sampling strategies."
    )

    kv_cache_dtype: str = Field(default="auto",
                                description="Data type for KV cache.")

    use_kv_cache: bool = Field(default=True,
                               description="Whether to use KV cache.")

    enable_iter_perf_stats: bool = Field(
        default=False, description="Enable iteration performance statistics.")

    enable_iter_req_stats: bool = Field(
        default=False,
        description=
        "If true, enables per request stats per iteration. Must also set enable_iter_perf_stats to true to get request stats."
    )

    print_iter_log: bool = Field(default=False,
                                 description="Print iteration logs.")

    torch_compile_enabled: bool = Field(
        default=False, description="Enable torch.compile optimization.")

    torch_compile_fullgraph: bool = Field(
        default=True,
        description="Enable full graph compilation in torch.compile.")

    torch_compile_inductor_enabled: bool = Field(
        default=False, description="Enable inductor backend in torch.compile.")

    torch_compile_piecewise_cuda_graph: bool = Field(
        default=False,
        description="Enable piecewise CUDA graph in torch.compile.")

    torch_compile_enable_userbuffers: bool = Field(
        default=True,
        description=
        "When torch compile is enabled, userbuffers is enabled by default.")

    autotuner_enabled: bool = Field(
        default=True,
        description="Enable autotuner only when torch compile is enabled.")

    enable_layerwise_nvtx_marker: bool = Field(
        default=False, description="If true, enable layerwise nvtx marker.")

    auto_deploy_config: Optional[object] = Field(
        default=None,
        description="Auto deploy config.",
        exclude_from_json=True,
        json_schema_extra={"type": f"Optional[AutoDeployConfig]"})

    load_format: Union[str, LoadFormat] = Field(
        default=LoadFormat.AUTO,
        description=
        "How to load the model weights. By default, detect the weight type from the model checkpoint."
    )

    enable_min_latency: bool = Field(
        default=False,
        description=
        "If true, enable min-latency mode. Currently only used for Llama4.",
    )

    @field_validator('load_format', mode='before')
    @classmethod
    def convert_load_format(cls, v):
        if isinstance(v, LoadFormat):
            return v
        load_format = v.upper()
        if load_format not in LoadFormat.__members__:
            raise ValueError(f"Invalid LoadFormat: {v}")
        return LoadFormat[load_format]
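Because the validator runs in `mode='before'`, string inputs are matched case-insensitively against the enum names. A standalone sketch of the same conversion logic, re-declaring `LoadFormat` locally so the snippet runs on its own:

from enum import Enum

class LoadFormat(Enum):
    AUTO = 0
    DUMMY = 1

def convert_load_format(v):
    # Mirrors TorchLlmArgs.convert_load_format above.
    if isinstance(v, LoadFormat):
        return v
    load_format = v.upper()
    if load_format not in LoadFormat.__members__:
        raise ValueError(f"Invalid LoadFormat: {v}")
    return LoadFormat[load_format]

assert convert_load_format("dummy") is LoadFormat.DUMMY
assert convert_load_format(LoadFormat.AUTO) is LoadFormat.AUTO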

    # Extra resource managers to use in addition to the KV cache manager.
    # Each manager's prepare_resources method is called before the forward pass,
    # and update_resources() is called after the pass finishes. free_resources()
    # is called when a request finishes. The KV cache manager is guaranteed to
    # be invoked after all of these extra managers in all stages.
    _extra_resource_managers: Dict[str,
                                   object] = PrivateAttr(default_factory=dict, )

    @property
    def extra_resource_managers(self) -> Dict[str, object]:
        return self._extra_resource_managers

    @extra_resource_managers.setter
    def extra_resource_managers(self, value: Dict[str, object]) -> None:
        self._extra_resource_managers = value

    @print_traceback_on_error
    def model_post_init(self, __context):
        from .._torch.model_config import MoeLoadBalancerConfig

        super().model_post_init(__context)
        self.model_format = _ModelFormatKind.HF

        if isinstance(self.moe_load_balancer, str):
            if not os.path.exists(self.moe_load_balancer):
                raise FileNotFoundError(
                    f"MoE load balancer config file not found: {self.moe_load_balancer}"
                )
            try:
                with open(self.moe_load_balancer) as f:
                    moe_load_balancer_config = yaml.safe_load(f)
                self.moe_load_balancer = MoeLoadBalancerConfig(
                    **moe_load_balancer_config)
            except Exception as e:
                raise ValueError(
                    f"Failed to load MoE load balancer config file: {self.moe_load_balancer}"
                ) from e
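As the branch above shows, a string-valued `moe_load_balancer` is treated as a path to a YAML file whose keys become `MoeLoadBalancerConfig` keyword arguments. A minimal sketch of the file-based form; the key names below are illustrative assumptions and must match the fields your version of `MoeLoadBalancerConfig` actually defines:

import yaml

# Hypothetical config file; the keys are assumed, not taken from this source.
with open("moe_lb.yaml", "w") as f:
    yaml.safe_dump({"num_slots": 4, "layer_updates_per_iter": 1}, f)

# TorchLlmArgs(model=..., moe_load_balancer="moe_lb.yaml") would then load
# and validate this file in model_post_init exactly as shown above.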

    # TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig
    def get_pytorch_backend_config(self) -> "PyTorchConfig":
        from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig

        # TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig
        # Just a WAR to support the auto_deploy
        if self.auto_deploy_config is not None:
            return self.auto_deploy_config

        return PyTorchConfig(
            extra_resource_managers=self.extra_resource_managers,
            use_cuda_graph=self.use_cuda_graph,
            cuda_graph_batch_sizes=self.cuda_graph_batch_sizes,
            cuda_graph_max_batch_size=self.cuda_graph_max_batch_size,
            cuda_graph_padding_enabled=self.cuda_graph_padding_enabled,
            disable_overlap_scheduler=self.disable_overlap_scheduler,
            moe_max_num_tokens=self.moe_max_num_tokens,
            moe_load_balancer=self.moe_load_balancer,
            attn_backend=self.attn_backend,
            moe_backend=self.moe_backend,
            mixed_sampler=self.mixed_sampler,
            enable_trtllm_sampler=self.enable_trtllm_sampler,
            kv_cache_dtype=self.kv_cache_dtype,
            use_kv_cache=self.use_kv_cache,
            enable_iter_perf_stats=self.enable_iter_perf_stats,
            enable_iter_req_stats=self.enable_iter_req_stats,
            print_iter_log=self.print_iter_log,
            torch_compile_enabled=self.torch_compile_enabled,
            torch_compile_fullgraph=self.torch_compile_fullgraph,
            torch_compile_inductor_enabled=self.torch_compile_inductor_enabled,
            torch_compile_piecewise_cuda_graph=self.
            torch_compile_piecewise_cuda_graph,
            torch_compile_enable_userbuffers=self.
            torch_compile_enable_userbuffers,
            autotuner_enabled=self.autotuner_enabled,
            enable_layerwise_nvtx_marker=self.enable_layerwise_nvtx_marker,
            load_format=self.load_format,
            enable_min_latency=self.enable_min_latency)
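Until the migration the TODO mentions is complete, the PyTorch backend consumes a `PyTorchConfig` derived field-by-field from these args. A small usage sketch, continuing the hypothetical `args` from the earlier snippet and assuming `PyTorchConfig` exposes these kwargs as attributes:

py_cfg = args.get_pytorch_backend_config()
assert py_cfg.use_cuda_graph == args.use_cuda_graph
assert py_cfg.attn_backend == "TRTLLM"  # the default declared above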

    @field_validator('cuda_graph_max_batch_size')
    @classmethod
    def validate_cuda_graph_max_batch_size(cls, v):
        """Validate cuda_graph_max_batch_size is non-negative."""
        if v < 0:
            raise ValueError("cuda_graph_max_batch_size must be non-negative")
        return v

    @staticmethod
    def _generate_cuda_graph_batch_sizes(max_batch_size: int,
                                         padding_enabled: bool) -> List[int]:
        """Generate a list of batch sizes for CUDA graphs.

        Args:
            max_batch_size: Maximum batch size to generate up to
            padding_enabled: Whether padding is enabled, which affects the batch size distribution

        Returns:
            List of batch sizes to create CUDA graphs for
        """
        if padding_enabled:
            batch_sizes = [1, 2, 4] + [i * 8 for i in range(1, 17)]
        else:
            batch_sizes = list(range(1, 32)) + [32, 64, 128]

        # Add powers of 2 up to max_batch_size
        batch_sizes += [
            2**i for i in range(8, math.floor(math.log(max_batch_size, 2)))
        ]

        # Filter and sort batch sizes
        batch_sizes = sorted(
            [size for size in batch_sizes if size <= max_batch_size])

        # Add max_batch_size if not already included
        if max_batch_size != batch_sizes[-1]:
            batch_sizes.append(max_batch_size)

        return batch_sizes
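For intuition, the helper can be exercised standalone; a re-declaration of the same logic (verbatim from the method above) plus the values it yields for a padded 128-batch setup:

import math
from typing import List

def generate_cuda_graph_batch_sizes(max_batch_size: int,
                                    padding_enabled: bool) -> List[int]:
    # Mirrors TorchLlmArgs._generate_cuda_graph_batch_sizes above.
    if padding_enabled:
        batch_sizes = [1, 2, 4] + [i * 8 for i in range(1, 17)]
    else:
        batch_sizes = list(range(1, 32)) + [32, 64, 128]
    batch_sizes += [
        2**i for i in range(8, math.floor(math.log(max_batch_size, 2)))
    ]
    batch_sizes = sorted(
        [size for size in batch_sizes if size <= max_batch_size])
    if max_batch_size != batch_sizes[-1]:
        batch_sizes.append(max_batch_size)
    return batch_sizes

print(generate_cuda_graph_batch_sizes(128, padding_enabled=True))
# [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128]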

    @model_validator(mode='after')
    def validate_cuda_graph_config(self) -> 'TorchLlmArgs':
        """Validate CUDA graph configuration.

        Ensures that:
        1. If cuda_graph_batch_sizes is provided, cuda_graph_max_batch_size must be 0
        2. If cuda_graph_batch_sizes is not provided, it is generated based on cuda_graph_max_batch_size
        3. If both are provided, cuda_graph_batch_sizes must match the generated values
        """
        if self.cuda_graph_batch_sizes is not None:
            self.cuda_graph_batch_sizes = sorted(self.cuda_graph_batch_sizes)
            if self.cuda_graph_max_batch_size != 0:
                if self.cuda_graph_batch_sizes != self._generate_cuda_graph_batch_sizes(
                        self.cuda_graph_max_batch_size,
                        self.cuda_graph_padding_enabled):
                    raise ValueError(
                        "Please don't set both cuda_graph_batch_sizes "
                        "and cuda_graph_max_batch_size.\n"
                        f"cuda_graph_batch_sizes: {self.cuda_graph_batch_sizes}, "
                        f"cuda_graph_max_batch_size: {self.cuda_graph_max_batch_size}"
                    )
            else:
                self.cuda_graph_max_batch_size = max(
                    self.cuda_graph_batch_sizes)
        else:
            max_batch_size = self.cuda_graph_max_batch_size or 128
            generated_sizes = self._generate_cuda_graph_batch_sizes(
                max_batch_size, self.cuda_graph_padding_enabled)
            self.cuda_graph_batch_sizes = generated_sizes
            self.cuda_graph_max_batch_size = max_batch_size

        return self
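The mutual-exclusion rule above means a mismatched explicit list plus a nonzero max trips a validation error at construction time. A sketch, reusing the hypothetical model path from earlier; note Pydantic wraps the error in a ValidationError, which is itself a ValueError subclass:

try:
    TorchLlmArgs(model="/models/llama-3-8b",  # hypothetical path
                 cuda_graph_batch_sizes=[1, 2, 3],
                 cuda_graph_max_batch_size=64)
except ValueError as e:
    print(e)  # "Please don't set both cuda_graph_batch_sizes ..."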


def update_llm_args_with_extra_dict(
        llm_args: Dict,
        llm_args_dict: Dict,
@ -2126,6 +2527,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -688,9 +692,6 @@
    def __init__(self, comm=None, n_workers: int = 1):
        if not external_mpi_comm_available(n_workers):
            raise RuntimeError('The LLM instance should be launched by mpirun.')

        self.comm = comm
        self.n_workers = n_workers
        self.thread_pool: Optional[ThreadPoolExecutor] = None
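`MpiCommSession` refuses to construct outside an external MPI launch, so a script using it is started through mpirun; for example (rank count and script name hypothetical):

mpirun -n 2 python run_llm_server.py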
|
||||
@ -1147,6 +1148,15 @@
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on June 03, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -873,6 +877,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -1177,6 +1181,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -785,6 +789,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -802,6 +806,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -1001,6 +1005,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -830,6 +834,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -661,6 +665,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -914,6 +918,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -812,6 +816,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -676,6 +680,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -802,6 +806,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -896,6 +900,15 @@
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
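The theme_switcher_version_match value above is only meaningful relative to _static/switcher.json, which is not part of this diff. Following the pydata-sphinx-theme switcher format (a JSON list of objects with version, url, and an optional name), a sketch of the check the theme performs client-side, transcribed to Python; the entry values here are illustrative assumptions, not taken from the repository:

# Sketch of the version-switcher match, assuming the documented
# switcher.json schema; the entries below are made up for illustration.
import json

switcher = json.loads("""
[
  {"name": "0.21.0rc0 (dev)", "version": "0.21.0rc0",
   "url": "https://nvidia.github.io/TensorRT-LLM/"},
  {"version": "0.20.0",
   "url": "https://nvidia.github.io/TensorRT-LLM/0.20.0/"}
]
""")

version_match = "0.21.0rc0"  # DOCUMENTATION_OPTIONS.theme_switcher_version_match
print(any(entry["version"] == version_match for entry in switcher))  # True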
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
<meta name="docsearch:version" content="0.21.0rc0" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -329,6 +329,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -350,6 +351,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -414,6 +416,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
@ -448,6 +451,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
@ -978,6 +982,15 @@
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on June 03, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
<meta name="docsearch:version" content="0.21.0rc0" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -329,6 +329,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -350,6 +351,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -414,6 +416,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
@ -448,6 +451,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
@ -1014,6 +1018,15 @@
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on June 03, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
<meta name="docsearch:version" content="0.21.0rc0" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -329,6 +329,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -350,6 +351,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -414,6 +416,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
@ -448,6 +451,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
@ -1950,6 +1954,15 @@
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on June 03, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
<meta name="docsearch:version" content="0.21.0rc0" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -329,6 +329,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -350,6 +351,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -414,6 +416,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
@ -448,6 +451,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
@ -2855,6 +2859,15 @@
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on June 03, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
<meta name="docsearch:version" content="0.21.0rc0" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -329,6 +329,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -350,6 +351,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -414,6 +416,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
@ -448,6 +451,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
@ -737,6 +741,15 @@
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on June 03, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
<meta name="docsearch:version" content="0.21.0rc0" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -329,6 +329,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -350,6 +351,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -414,6 +416,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
@ -448,6 +451,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
@ -899,6 +903,15 @@
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on June 03, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
<meta name="docsearch:version" content="0.21.0rc0" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -329,6 +329,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -350,6 +351,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -414,6 +416,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
@ -448,6 +451,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
@ -827,6 +831,15 @@
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on June 03, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
<meta name="docsearch:version" content="0.21.0rc0" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -329,6 +329,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -350,6 +351,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -414,6 +416,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
@ -448,6 +451,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
@ -1019,6 +1023,15 @@
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on June 03, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
<meta name="docsearch:version" content="0.21.0rc0" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -329,6 +329,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -350,6 +351,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -414,6 +416,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
@ -448,6 +451,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
@ -946,6 +950,15 @@
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on June 03, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@@ -60,7 +60,7 @@

<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />


</head>
@@ -329,6 +329,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@@ -1055,6 +1059,15 @@
</p>
</div>

<div class="footer-item">
<div class="extra_footer">

<p>Last updated on June 03, 2025.</p>

<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>

</div></div>

</div>
@ -50,7 +50,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -60,7 +60,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
<meta name="docsearch:version" content="0.21.0rc0" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -329,6 +329,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -350,6 +351,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -414,6 +416,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
@ -448,6 +451,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
@ -551,7 +555,7 @@
                                     WeightOnlyQuantRowLinear)
    from ..quantization.mode import (KV_CACHE_QUANT_ALGO_LIST, QUANT_ALGO_LIST,
                                     W8A8_SQ_PLUGIN_LIST, QuantAlgo)
    from ..quantization.utils.fp4_utils import float4_sf_dtype
    from ..quantization.utils import fp4_utils
    from ..top_model_mixin import TopModelMixin
    from .convert_utils import weight_only_quantize_dict
    from .generation_mixin import GenerationMixin
@ -603,6 +607,7 @@
        LOOKAHEAD_DECODING = auto()
        EXPLICIT_DRAFT_TOKENS = auto()
        EAGLE = auto()
        NGRAM = auto()

@ -620,6 +625,8 @@
            return SpeculativeDecodingMode.EXPLICIT_DRAFT_TOKENS
        elif args.speculative_decoding_mode == "eagle":
            return SpeculativeDecodingMode.EAGLE
        elif args.speculative_decoding_mode == "ngram":
            return SpeculativeDecodingMode.NGRAM
        else:
            assert False, "Unknown speculative_decoding_mode " + args.speculative_decoding_mode
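For context while reading the hunk above: it registers a new NGRAM member on the speculative-decoding enum and extends the string-to-enum dispatch in from_arguments accordingly. A minimal, self-contained sketch of that dispatch pattern follows; the Enum base class and the argparse-style args object are assumptions for this sketch, not the library's exact API:

    from enum import Enum, auto
    from types import SimpleNamespace

    class SpeculativeDecodingMode(Enum):  # base class assumed for this sketch
        LOOKAHEAD_DECODING = auto()
        EXPLICIT_DRAFT_TOKENS = auto()
        EAGLE = auto()
        NGRAM = auto()

        @staticmethod
        def from_arguments(args):
            # Mirrors the if/elif chain shown in the diff above.
            if args.speculative_decoding_mode == "eagle":
                return SpeculativeDecodingMode.EAGLE
            elif args.speculative_decoding_mode == "ngram":
                return SpeculativeDecodingMode.NGRAM
            else:
                assert False, "Unknown speculative_decoding_mode " + args.speculative_decoding_mode

    mode = SpeculativeDecodingMode.from_arguments(
        SimpleNamespace(speculative_decoding_mode="ngram"))
    assert mode is SpeculativeDecodingMode.NGRAM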
@ -2389,15 +2396,18 @@
            # Interleave block scale for NVFP4 plugin.
            for name in list(weights):
                if name.endswith('weights_scaling_factor'):
                    ori_shape = weights[name].shape
                    out_features, in_features = weights[name].shape
                    nrows = fp4_utils.pad_up(out_features, 128)
                    ncols = fp4_utils.pad_up(in_features, 4)
                    new_name = name.replace('weights_scaling_factor',
                                            'weights_block_scaling_factor')
                    weights[new_name] = weights[name]
                    weights[
                        new_name +
                        "_interleaved"] = torch.ops.tensorrt_llm.nvfp4_block_scale_interleave(
                            weights[name].view(float4_sf_dtype).cpu().contiguous(
                            )).reshape(ori_shape).view(float4_sf_dtype)
                            weights[name].view(fp4_utils.float4_sf_dtype).cpu(
                            ).contiguous()).reshape(nrows, ncols).view(
                                fp4_utils.float4_sf_dtype)
                    weights.pop(name)
                if name.endswith('weights_scaling_factor_2'):
                    new_name = name.replace('weights_scaling_factor_2',
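The hunk above also changes how NVFP4 block scales are laid out before interleaving: instead of reshaping back to the original weight shape, the scale tensor is reshaped to an (nrows, ncols) buffer padded to multiples of 128 and 4. A plain-Python sketch of the padding arithmetic, assuming fp4_utils.pad_up rounds up to the next multiple:

    def pad_up(x: int, multiple: int) -> int:
        # Assumed semantics of fp4_utils.pad_up: round x up to a multiple.
        return ((x + multiple - 1) // multiple) * multiple

    # e.g. a 300 x 6 scaling-factor tensor lands in a 384 x 8 padded layout
    nrows = pad_up(300, 128)  # 384
    ncols = pad_up(6, 4)      # 8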
@ -533,7 +537,8 @@
<span class="kn">from</span><span class="w"> </span><span class="nn">.._ipc_utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">IpcMemory</span><span class="p">,</span> <span class="n">can_access_peer</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">.._utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">get_sm_version</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.internal.runtime</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">lamport_initialize</span><span class="p">,</span>
<span class="n">lamport_initialize_all</span><span class="p">)</span>
<span class="n">lamport_initialize_all</span><span class="p">,</span>
<span class="n">max_workspace_size_lowprecision</span><span class="p">)</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..logger</span><span class="w"> </span><span class="kn">import</span> <span class="n">logger</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..mapping</span><span class="w"> </span><span class="kn">import</span> <span class="n">Mapping</span>

@ -1191,7 +1196,7 @@
<span class="sd"> Then, each instance of allreduce will reference that tensor automatically.</span>
<span class="sd"> """</span>
<span class="n">POINTERS_PER_RANK</span> <span class="o">=</span> <span class="mi">7</span>
<span class="n">POINTERS_OF_COUNTER</span> <span class="o">=</span> <span class="mi">2</span>
<span class="n">POINTERS_OF_COUNTER</span> <span class="o">=</span> <span class="mi">3</span>

<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">workspace</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Tensor</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
@ -1225,6 +1230,17 @@
<span class="k">return</span> <span class="mi">16_000_000</span>
<span class="k">return</span> <span class="mi">8_000_000</span>

<span class="nd">@staticmethod</span>
<span class="k">def</span><span class="w"> </span><span class="nf">max_workspace_size_lowprecision</span><span class="p">(</span><span class="n">tp_size</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span>
<span class="k">return</span> <span class="n">max_workspace_size_lowprecision</span><span class="p">(</span><span class="n">tp_size</span><span class="p">)</span>

<span class="nd">@staticmethod</span>
<span class="k">def</span><span class="w"> </span><span class="nf">initialize_lowprecision_buffers</span><span class="p">(</span><span class="n">workspace</span><span class="p">:</span> <span class="s2">"torch.tensor"</span><span class="p">,</span>
<span class="n">tp_size</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
<span class="k">return</span> <span class="n">torch</span><span class="o">.</span><span class="n">ops</span><span class="o">.</span><span class="n">trtllm</span><span class="o">.</span><span class="n">initialize_static_lowprecision_buffers</span><span class="p">(</span>
<span class="n">workspace</span><span class="p">,</span> <span class="n">tp_size</span><span class="p">)</span>

<span class="nd">@staticmethod</span>
<span class="k">def</span><span class="w"> </span><span class="nf">allocate_workspace</span><span class="p">(</span><span class="n">mapping</span><span class="p">:</span> <span class="n">Mapping</span><span class="p">,</span>
<span class="n">size</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">IpcMemory</span><span class="p">],</span> <span class="s2">"torch.tensor"</span><span class="p">]:</span>
@ -1239,11 +1255,11 @@
<span class="n">ipc_buffers_pong</span> <span class="o">=</span> <span class="n">IpcMemory</span><span class="p">(</span><span class="n">mapping</span><span class="p">,</span> <span class="n">ipc_buffers_size</span><span class="p">,</span>
<span class="n">is_p2p_supported</span><span class="p">)</span>
<span class="n">ipc_barriers_in</span> <span class="o">=</span> <span class="n">IpcMemory</span><span class="p">(</span>
<span class="n">mapping</span><span class="p">,</span> <span class="n">IpcMemory</span><span class="o">.</span><span class="n">IPC_BARRIERS_SIZE_PER_GPU</span> <span class="o">*</span> <span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span> <span class="o">*</span> <span class="mi">2</span><span class="p">,</span>
<span class="n">is_p2p_supported</span><span class="p">)</span>
<span class="n">mapping</span><span class="p">,</span> <span class="n">IpcMemory</span><span class="o">.</span><span class="n">IPC_BARRIERS_SIZE_PER_GPU</span> <span class="o">*</span> <span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span> <span class="o">*</span> <span class="mi">2</span> <span class="o">*</span>
<span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span><span class="p">,</span> <span class="n">is_p2p_supported</span><span class="p">)</span>
<span class="n">ipc_barriers_out</span> <span class="o">=</span> <span class="n">IpcMemory</span><span class="p">(</span>
<span class="n">mapping</span><span class="p">,</span> <span class="n">IpcMemory</span><span class="o">.</span><span class="n">IPC_BARRIERS_SIZE_PER_GPU</span> <span class="o">*</span> <span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span> <span class="o">*</span> <span class="mi">2</span><span class="p">,</span>
<span class="n">is_p2p_supported</span><span class="p">)</span>
<span class="n">mapping</span><span class="p">,</span> <span class="n">IpcMemory</span><span class="o">.</span><span class="n">IPC_BARRIERS_SIZE_PER_GPU</span> <span class="o">*</span> <span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span> <span class="o">*</span> <span class="mi">2</span> <span class="o">*</span>
<span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span><span class="p">,</span> <span class="n">is_p2p_supported</span><span class="p">)</span>
<span class="n">lamport_buffers_size</span> <span class="o">=</span> <span class="mi">1</span> <span class="k">if</span> <span class="n">force_deterministic</span> <span class="k">else</span> <span class="n">size</span> <span class="o">*</span> <span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span>
<span class="n">lamport_buffers_0</span> <span class="o">=</span> <span class="n">IpcMemory</span><span class="p">(</span><span class="n">mapping</span><span class="p">,</span> <span class="n">lamport_buffers_size</span><span class="p">,</span>
<span class="n">is_p2p_supported</span><span class="p">)</span>
@ -1261,16 +1277,55 @@
<span class="n">lamport_buffers_size</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">buffers</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">ipc_buffers_ping</span><span class="p">,</span> <span class="n">ipc_buffers_pong</span><span class="p">,</span> <span class="n">ipc_barriers_in</span><span class="p">,</span>
<span class="n">ipc_barriers_out</span><span class="p">,</span> <span class="n">lamport_buffers_0</span><span class="p">,</span> <span class="n">lamport_buffers_1</span><span class="p">,</span>
<span class="n">lamport_buffers_2</span>
<span class="n">ipc_buffers_ping</span><span class="p">,</span>
<span class="n">ipc_buffers_pong</span><span class="p">,</span>
<span class="n">ipc_barriers_in</span><span class="p">,</span>
<span class="n">ipc_barriers_out</span><span class="p">,</span>
<span class="n">lamport_buffers_0</span><span class="p">,</span>
<span class="n">lamport_buffers_1</span><span class="p">,</span>
<span class="n">lamport_buffers_2</span><span class="p">,</span>
<span class="c1"># Start from 1 since 0 represents released state for barrier at the beginning of the all_reduce.</span>
<span class="c1"># The last element is the barrier flag counter.</span>
<span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s2">"cuda"</span><span class="p">)</span>
<span class="p">]</span>

<span class="k">return</span> <span class="n">buffers</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">(</span>
<span class="n">ipc_buffers_ping</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span> <span class="n">ipc_buffers_pong</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span>
<span class="n">ipc_barriers_in</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span> <span class="n">ipc_barriers_out</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span>
<span class="n">lamport_buffers_0</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span> <span class="n">lamport_buffers_1</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span>
<span class="n">lamport_buffers_2</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span> <span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">+</span> <span class="p">[</span><span class="mi">0</span><span class="p">],</span>
<span class="n">lamport_buffers_2</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span> <span class="p">[</span><span class="n">buffers</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">data_ptr</span><span class="p">()]</span> <span class="o">+</span>
<span class="p">[</span><span class="n">buffers</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">][</span><span class="mi">1</span><span class="p">:]</span><span class="o">.</span><span class="n">data_ptr</span><span class="p">()]</span> <span class="o">+</span> <span class="p">[</span><span class="n">buffers</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">][</span><span class="mi">2</span><span class="p">:]</span><span class="o">.</span><span class="n">data_ptr</span><span class="p">()],</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span>
<span class="n">device</span><span class="o">=</span><span class="s2">"cpu"</span><span class="p">)</span>

<span class="nd">@staticmethod</span>
<span class="k">def</span><span class="w"> </span><span class="nf">allocate_lowprecision_workspace</span><span class="p">(</span>
<span class="n">mapping</span><span class="p">:</span> <span class="n">Mapping</span><span class="p">,</span>
<span class="n">size</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">IpcMemory</span><span class="p">],</span> <span class="s2">"torch.tensor"</span><span class="p">]:</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>

<span class="c1"># Force pull mode and disable lamport when force deterministic is enabled, for reducing device memory usage.</span>
<span class="n">is_p2p_supported</span> <span class="o">=</span> <span class="n">can_access_peer</span><span class="p">(</span><span class="n">mapping</span><span class="p">)</span>
<span class="n">ipc_buffers_size</span> <span class="o">=</span> <span class="n">size</span>
<span class="n">ipc_buffers_ping</span> <span class="o">=</span> <span class="n">IpcMemory</span><span class="p">(</span><span class="n">mapping</span><span class="p">,</span> <span class="n">ipc_buffers_size</span><span class="p">,</span>
<span class="n">is_p2p_supported</span><span class="p">)</span>
<span class="n">ipc_buffers_pong</span> <span class="o">=</span> <span class="n">IpcMemory</span><span class="p">(</span><span class="n">mapping</span><span class="p">,</span> <span class="n">ipc_buffers_size</span><span class="p">,</span>
<span class="n">is_p2p_supported</span><span class="p">)</span>
<span class="n">ipc_barriers_in</span> <span class="o">=</span> <span class="n">IpcMemory</span><span class="p">(</span>
<span class="n">mapping</span><span class="p">,</span> <span class="n">IpcMemory</span><span class="o">.</span><span class="n">IPC_BARRIERS_SIZE_PER_GPU</span> <span class="o">*</span> <span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span> <span class="o">*</span> <span class="mi">2</span><span class="p">,</span>
<span class="n">is_p2p_supported</span><span class="p">)</span>
<span class="n">ipc_barriers_out</span> <span class="o">=</span> <span class="n">IpcMemory</span><span class="p">(</span>
<span class="n">mapping</span><span class="p">,</span> <span class="n">IpcMemory</span><span class="o">.</span><span class="n">IPC_BARRIERS_SIZE_PER_GPU</span> <span class="o">*</span> <span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span> <span class="o">*</span> <span class="mi">2</span><span class="p">,</span>
<span class="n">is_p2p_supported</span><span class="p">)</span>
<span class="n">buffers</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">ipc_buffers_ping</span><span class="p">,</span> <span class="n">ipc_buffers_pong</span><span class="p">,</span> <span class="n">ipc_barriers_in</span><span class="p">,</span>
<span class="n">ipc_barriers_out</span>
<span class="p">]</span>

<span class="k">return</span> <span class="n">buffers</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">(</span>
<span class="n">ipc_buffers_ping</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span> <span class="n">ipc_buffers_pong</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span>
<span class="n">ipc_barriers_in</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span> <span class="n">ipc_barriers_out</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span> <span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">+</span>
<span class="p">[</span><span class="mi">0</span><span class="p">],</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span>
<span class="n">device</span><span class="o">=</span><span class="s2">"cpu"</span><span class="p">)</span>
@ -1010,7 +1014,9 @@
     @property
     def num_layers(self) -> int:
         return self.model_config.num_layers(
-            self.world_config.pipeline_parallelism)
+            self.world_config.pipeline_parallelism,
+            self.world_config.pipeline_parallel_rank,
+        )

     @property
     def max_sequence_length(self) -> int:
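The added `pipeline_parallel_rank` argument indicates that the layer count is now computed per pipeline stage rather than by a uniform division alone. A minimal sketch of what such a per-rank split could look like (the helper below is hypothetical, not the actual `ModelConfig.num_layers` implementation):

```python
def num_layers_for_rank(total_layers: int, pp_size: int, pp_rank: int) -> int:
    """Hypothetical per-rank layer count: spread layers over pipeline
    stages, giving the remainder to the earliest ranks."""
    base, remainder = divmod(total_layers, pp_size)
    return base + (1 if pp_rank < remainder else 0)

# e.g. 61 layers over 4 stages -> [16, 15, 15, 15]
print([num_layers_for_rank(61, 4, r) for r in range(4)])
```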
@ -513,6 +517,7 @@
 from pydantic import BaseModel

 from tensorrt_llm.bindings import executor as tllme
+from tensorrt_llm.executor.serialization import register_approved_ipc_class

@ -579,6 +584,14 @@
         """
         pass  # noqa

+    def __init_subclass__(cls, **kwargs):
+        """
+        This method is called when a class inherits from LogitsProcessor.
+        """
+        # Register subclass as an approved class for deserialization across IPC boundaries.
+        super().__init_subclass__(**kwargs)
+        register_approved_ipc_class(cls)
+

 class BatchedLogitsProcessor(ABC):
     """Base class for batched logits processor.
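With this hook in place, any subclass of `LogitsProcessor` is registered as IPC-safe the moment it is defined; no explicit registration call is needed. A minimal sketch of a custom processor (the `__call__` signature here is an assumption based on the surrounding API and may differ between versions):

```python
from tensorrt_llm.sampling_params import LogitsProcessor  # assumed import path

class BanTokenProcessor(LogitsProcessor):
    """Illustrative processor that masks out one token id per step."""

    def __init__(self, banned_token_id: int):
        self.banned_token_id = banned_token_id

    def __call__(self, req_id, logits, token_ids, stream_ptr, client_id):
        # Assumed signature: push the banned token's logit to -inf in place
        # so it can never be sampled.
        logits[..., self.banned_token_id] = float("-inf")

# Defining the class above already triggered __init_subclass__, so
# BanTokenProcessor can now cross IPC boundaries without extra setup.
```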
@ -790,20 +803,18 @@
         For instance, while the greedy decoding with n > 1 is capable in the
         Executor class of C++ runtime, the LLM API disallows such combination.
         '''
         if self.best_of is not None:
-            if self.best_of > 1 and self.best_of < self.n:
-                raise ValueError(
-                    f'In beam search, best_of ({self.best_of}) must be '
-                    f'greater than or equal to n ({self.n}).')
+            if self.best_of < self.n:
+                raise ValueError(
+                    f"best_of ({self.best_of}) cannot be less than n ({self.n})")

-            if (self.best_of > 1 and self._greedy_decoding and
-                    not os.environ.get('TLLM_ALLOW_N_GREEDY_DECODING', None)):
-                raise ValueError(
-                    f'Greedy decoding in the LLM API does not allow multiple '
-                    f'returns. Please set to best_of=1, got best_of={self.best_of}. '
-                    f'Please set to best_of=1 or set an environment variable '
-                    f'TLLM_ALLOW_N_GREEDY_DECODING=1 to allow best_of > 1 '
-                    f'under the greedy decoding.')
+            if (self.best_of > 1 and self._greedy_decoding
+                    and not os.environ.get('TLLM_ALLOW_N_GREEDY_DECODING', None)):
+                raise ValueError(
+                    f'Greedy decoding in the LLM API does not allow multiple '
+                    f'returns (got best_of={self.best_of}). Please set best_of=1 '
+                    f'or set the environment variable TLLM_ALLOW_N_GREEDY_DECODING=1 '
+                    f'to allow best_of > 1 under greedy decoding.')

         if self.truncate_prompt_tokens is not None and self.truncate_prompt_tokens < 1:
             raise ValueError(
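A quick illustration of the constraint this validation enforces (treating the exact import path and keyword names as assumptions based on the snippet above):

```python
from tensorrt_llm import SamplingParams  # assumed import path

# Fine: return the best 2 of 4 candidate sequences.
params = SamplingParams(n=2, best_of=4, use_beam_search=True)

# Rejected by the validation above, since best_of < n:
#   ValueError: best_of (1) cannot be less than n (2)
# params = SamplingParams(n=2, best_of=1)
```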
@ -16,6 +16,12 @@ ________
 .. doxygenfile:: tensor.h
    :project: TensorRT-LLM

+transferAgent.h
+_______________
+
+.. doxygenfile:: transferAgent.h
+   :project: TensorRT-LLM
+
 serialization.h
 _______________
75 _sources/advanced/kv-cache-management.md.txt Normal file
@ -0,0 +1,75 @@
(kv-cache-management)=

# KV Cache Management: Pools, Blocks, and Events

This document provides an overview of the internal hierarchy and event system for paged KV cache management, as implemented in the TensorRT-LLM codebase.

For more information on KV cache reuse, see [KV cache reuse](kv-cache-reuse.md).

---

## Hierarchy: Pool, Block, and Page

### **Block**
- **Definition:** The smallest unit of KV cache allocation. A `KVCacheBlock` holds metadata (not the actual data) for a chunk of KV cache.
- **Purpose:** Each block represents a fixed number of tokens' worth of KV data (configurable via the `tokens_per_block` parameter).
- **Usage:** Blocks are allocated, reused, or evicted as sequences are processed.

### **Page**
- **Definition:** In this codebase, "page" is often used interchangeably with "block" (as in "paged KV cache"), but technically, a page could refer to a memory page (hardware-level), while a block is a logical unit for the cache.
- **In Practice:** The code uses "block" as the main unit; "page" is not a distinct class or struct.

### **Pool**
- **Definition:** A pool is a contiguous memory buffer (or set of buffers) that holds the actual KV data for one or more layers.
- **Types:** There are primary pools (fast GPU memory) and secondary pools (slower, e.g., CPU or offload memory).
- **Organization:** Each pool can serve multiple layers that share the same KV head configuration. Pools are managed by `KVCacheBlockPool` and tracked in vectors in `WindowBlockManager`.
- **Block ↔ Pool:** Each block is an index into a pool; the pool provides the actual storage, while the block is the metadata handle.

### **WindowBlockManager/BlockManager**

TRT-LLM supports 2 complex features related to KV cache management:
1. **Variable Group-Query Attention (VGQA)** - i.e. a different `num_kv_heads` value for different layers.
2. **Variable Sliding Window Attention (VSWA)** - i.e. a different `attention_window_size` value for different layers.

To support both of these features, pool management works as described below.

In the simple, *most common case*, for most models, where
1. [MHA/MQA/Non-variable GQA](gpt-attention.md#multi-head-multi-query-and-group-query-attention), i.e., the same `num_kv_heads` value for all layers,
2. Global attention/[SWA](gpt-attention.md#sliding-window-attention-cyclic-rolling-buffer-kv-cache), i.e., the same `attention_window_size` value for all layers,

only a *single* pool is created within the structure described below.

#### KV Cache Pool Management

- **WindowBlockManager:** Manages blocks and pools for a specific attention window size. Within a `WindowBlockManager`, there can be multiple pools - each corresponding to a unique number of KV heads - i.e., to support VGQA.
- **BlockManager:** Manages all `WindowBlockManager` instances, one per unique window size.

**Hierarchy Summary:**
- **Pool** (memory buffer for KV data)
  - Contains many blocks.
- **Blocks** (metadata for a chunk of the pool, each block = `tokens_per_block` tokens)
  - (Optionally, blocks can be swapped between primary/secondary pools.)
- **BlockManager/WindowBlockManager**: Manage pools and blocks; handle allocation, reuse, and eviction.
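To make the block-as-metadata-handle relationship concrete, here is a minimal Python sketch of the hierarchy (class and field names are illustrative stand-ins, not the actual C++ types):

```python
from dataclasses import dataclass, field

@dataclass
class Block:
    """Metadata handle: indexes into a pool, owns no KV data itself."""
    block_id: int        # position of this block's storage within the pool
    num_tokens: int = 0  # how many of the tokens_per_block slots are filled

@dataclass
class Pool:
    """Storage buffer for all layers sharing one KV-head configuration."""
    num_kv_heads: int
    tokens_per_block: int
    blocks: list = field(default_factory=list)

    def allocate_block(self) -> Block:
        block = Block(block_id=len(self.blocks))
        self.blocks.append(block)
        return block

# One pool per unique num_kv_heads (VGQA), grouped per window size (VSWA).
pool = Pool(num_kv_heads=8, tokens_per_block=32)
first_block = pool.allocate_block()
```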
---

## Events in `KVCacheEventManager`

The `KVCacheEventManager` is responsible for tracking and reporting significant changes in the state of the KV cache. Events are used for logging, debugging, or possibly for external monitoring.

### **Types of Events**
- **Created Event:** When pools or blocks are created/allocated.
- **Updated Event:** When a block's state changes (e.g., moved between primary/secondary, priority updated).
- **Removed Event:** When a block is removed from the cache (evicted or released).
- **Stored Event:** When blocks are stored for potential reuse (e.g., after a sequence finishes and its blocks are reusable).

### **What Triggers an Event?**
- **Allocation/Deallocation:** Creating or freeing memory pools or blocks.
- **Eviction/Reuse:** When a block is evicted, reused, or its priority changes.
- **Block Movement:** When a block is moved between memory levels (primary ↔ secondary).
- **Block Storage:** When blocks are stored for future reuse (e.g., after a sequence completes).

**In summary:**
An "event" is any significant change in the lifecycle or state of a KV cache block or pool, tracked for monitoring, debugging, or optimization purposes.
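For reference, polling these events from the LLM API looks roughly like the sketch below; the method name, config fields, and signature are assumptions based on the "Get KV Cache Events" example linked in the navigation, so check that example for the exact API:

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig  # assumed import path

# Event buffering must be enabled for the manager to record events.
kv_cache_config = KvCacheConfig(enable_block_reuse=True,
                                event_buffer_max_size=1024)  # assumed fields
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
          kv_cache_config=kv_cache_config)

llm.generate(["Hello, world!"])

# Drain buffered events: each one describes a created/stored/updated/removed
# transition of a block or pool, as categorized above.
for event in llm.get_kv_cache_events(timeout=2):  # assumed signature
    print(event)
```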
---
65 _sources/advanced/lowprecision-pcie-allreduce.md.txt Normal file
@ -0,0 +1,65 @@
# Low-Precision-AllReduce

```{note}
This feature is optimized for PCIe-based GPU topologies and may affect model accuracy. Please evaluate the precision impact for your specific workload.
```

TRT-LLM supports `low-precision-allreduce`, a communication optimization that accelerates AllReduce operations in PCIe-based GPU environments. This feature quantizes FP16/BF16 data to FP8 during network transmission, reducing communication volume and improving performance.

## Algorithm

The Low-Precision-AllReduce algorithm works by:
1. Quantizing input FP16/BF16 tensors to FP8 format before network transmission.

   **Quantization details**: We use a "per-warp" quantization approach where each CUDA warp (32 threads) processes a batch of data. In each warp, 31 threads quantize FP16/BF16 values to FP8 e4m3 format (16 bytes per thread), while the last thread transmits a scalar value. As a result, each warp collectively quantizes 496 elements plus one scalar at a time (see the sketch after this list).

2. Transmitting the quantized data through the network.
3. Dequantizing the received data back to the original precision.
4. Performing the reduction operation.

In 8-GPU scenarios, this approach shifts the communication bottleneck from cross-NUMA QPI to the PCIe switch, resulting in better overall performance.
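To illustrate the precision tradeoff in step 1, here is a small NumPy sketch of per-group symmetric quantization to an e4m3-style range, mirroring in spirit (not in kernel layout) the per-warp scheme above. The 448.0 maximum of FP8 e4m3 is the only hard constant; the rounding helper is a crude approximation that ignores subnormals:

```python
import numpy as np

E4M3_MAX = 448.0  # largest finite FP8 e4m3 value

def round_to_e4m3(x: np.ndarray) -> np.ndarray:
    """Crude e4m3 rounding: clamp to +/-448 and keep ~3 mantissa bits."""
    x = np.clip(x, -E4M3_MAX, E4M3_MAX)
    m, e = np.frexp(x)             # x = m * 2**e with 0.5 <= |m| < 1
    m = np.round(m * 16.0) / 16.0  # 1 implicit + 3 explicit mantissa bits
    return np.ldexp(m, e)

def quantize_group(x: np.ndarray):
    """One scale per 496-value group, echoing the single scalar each warp
    transmits alongside its 31 threads x 16 bytes of e4m3 payload."""
    scale = np.abs(x).max() / E4M3_MAX
    return round_to_e4m3(x / scale), scale

rng = np.random.default_rng(0)
x = rng.standard_normal(496).astype(np.float32)
q, scale = quantize_group(x)
x_hat = q * scale  # dequantized on the receiving GPU before reduction
print(f"max abs quantization error: {np.abs(x - x_hat).max():.2e}")
```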
## Topology Requirements

![lowprecision](../media/lowprecision.png)

Low-Precision-AllReduce is specifically designed for the topology shown above, where:
- Each node contains 2 NUMA domains
- Each NUMA domain has 4 GPUs connected via a PCIe switch
- GPUs within the same NUMA node communicate via the PCIe switch

**Important:** This optimization will not accelerate performance in different topologies (e.g., where each GPU is in a separate NUMA domain).

## Usage

The Low-Precision-AllReduce algorithm can be enabled in two ways:

1. **Direct specification** in your code:
```
AllReduce allreduce(mapping=mapping, strategy=AllReduceStrategy.LOWPRECISION);
```
2. **Environment variable control** with the AUTO strategy:
```
// In your code
AllReduce allreduce(mapping=mapping, strategy=AllReduceStrategy.AUTO);
// Set environment variable before running
export FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY=1
```

## Performance and Accuracy Considerations

Low-Precision-AllReduce reduces communication volume by using the FP8 data format for transmission. This optimization:
- Improves performance for large message sizes in PCIe-based topologies
- May slightly reduce numerical precision
- Automatically falls back to other strategies when no performance benefit is expected (e.g., with NVLink or small messages)

Users should evaluate the precision impact on their specific models and workloads.

## Environment Variables

- `FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY`: When set to `1`, forces the use of the low-precision algorithm with the AUTO strategy. If the algorithm determines it cannot provide performance benefits, it automatically falls back to other strategies.

**Note**: When TensorRT-LLM is compiled without the `ENABLE_FP8` option, enabling low-precision AllReduce will have no effect.
@ -134,9 +134,8 @@ To do the benchmark, run the following command:
 YOUR_DATA_PATH=<your dataset file following the format>

 cat >./extra-llm-api-config.yml<<EOF
-pytorch_backend_config:
-  use_cuda_graph: true
-  moe_backend: TRTLLM
+use_cuda_graph: true
+moe_backend: TRTLLM
 speculative_config:
   decoding_type: MTP
   num_nextn_predict_layers: 3
@ -202,21 +201,20 @@ python ${YOUR_WORK_PATH}/benchmarks/cpp/prepare_dataset.py \
 YOUR_DATA_PATH=./dataset.txt

 cat >./extra-llm-api-config.yml <<EOF
-pytorch_backend_config:
-  use_cuda_graph: true
-  cuda_graph_padding_enabled: true
-  cuda_graph_batch_sizes:
-  - 1
-  - 2
-  - 4
-  - 8
-  - 16
-  - 32
-  - 64
-  - 128
-  - 256
-  - 384
-  print_iter_log: true
+use_cuda_graph: true
+cuda_graph_padding_enabled: true
+cuda_graph_batch_sizes:
+- 1
+- 2
+- 4
+- 8
+- 16
+- 32
+- 64
+- 128
+- 256
+- 384
+print_iter_log: true
 enable_attention_dp: true
 EOF

@ -257,8 +255,7 @@ To do the benchmark, run the following command:
 YOUR_DATA_PATH=<your dataset file following the format>

 cat >./extra-llm-api-config.yml<<EOF
-pytorch_backend_config:
-  use_cuda_graph: true
+use_cuda_graph: true
 speculative_config:
   decoding_type: MTP
   num_nextn_predict_layers: 3
@ -307,10 +304,9 @@ python ${YOUR_WORK_PATH}/benchmarks/cpp/prepare_dataset.py \
 YOUR_DATA_PATH=./dataset.txt

 cat >./extra-llm-api-config.yml<<EOF
-pytorch_backend_config:
-  use_cuda_graph: true
-  cuda_graph_batch_sizes:
-  - 128
+use_cuda_graph: true
+cuda_graph_batch_sizes:
+- 128
 enable_attention_dp: true
 EOF
@ -50,7 +50,7 @@ Output Sequence Length (OSL): 2k tokens
 ### Model Architecture
 The base DeepSeek-R1 main model contains 3x dense layers (initial) and 58x MoE layers; there is also 1x Multi-Token Prediction (MTP) layer (MoE-architecture equivalent) for speculative decoding. Our optimized configuration extends the MTP layer to 3x layers using autoregressive styling for peak performance exploration.

-<img src="../media/tech_blog1_model_overview.png?raw=true" alt="tech_blog1_model_overview" width="500" height="auto">
+<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog1_model_overview.png?raw=true" alt="tech_blog1_model_overview" width="500" height="auto">

 ### Precision Strategy
 We have explored a mixed precision recipe, which provides a better tradeoff between accuracy and performance.
@ -84,7 +84,7 @@ We have also explored and introduced mixed parallel strategy on 8xB200 GPUs. Spe
 ### Everything in One Diagram
 Now let's put everything into one diagram, which represents a MoE layer from a decoding iteration.

-<img src="../media/tech_blog1_model_details.png?raw=true" alt="tech_blog1_model_details" width="1600" height="auto">
+<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog1_model_details.png?raw=true" alt="tech_blog1_model_details" width="1600" height="auto">

 The modules in the diagram are:
@ -136,7 +136,7 @@ The modules in the diagram are:
 | Optimize CUTLASS Flow: Sparse Experts as GEMMs | 249 | The code is not open-source yet due to its dependency on an internal base environment; we plan to decouple it so that it can be open-sourced in the future. |
 | Introduce EP4TP2 for better workload balance | 253 | Use `--tp 8 --ep 4` when benchmarking |
 | Introduce moe_backend=TRTLLM, EP2TP4 for better balance | 299 | [PR #4280](https://github.com/NVIDIA/TensorRT-LLM/pull/4280) |
-| Optimize Fuse_A_GEMM and Router_GEMM | 340 | WIP: [PR #4115](https://github.com/NVIDIA/TensorRT-LLM/pull/4115) |
+| Optimize Fuse_A_GEMM and Router_GEMM | 340 | WIP |
 | Relax Acceptance | **368** | [deepseek_v3#multi-token-prediction-mtp](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/deepseek_v3#multi-token-prediction-mtp) |

 ### System Level optimizations
@ -195,7 +195,7 @@ We have introduced multi-streams based optimizations to hide some kernels' overhead.

 #### Sparse Experts as GEMMs (only works when moe_backend=CUTLASS)

-<img src="../media/tech_blog1_sparse_exp_as_a_gemm.png?raw=true" alt="tech_blog1_sparse_exp_as_a_gemm" width="800" height="auto">
+<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog1_sparse_exp_as_a_gemm.png?raw=true" alt="tech_blog1_sparse_exp_as_a_gemm" width="800" height="auto">

 The existing CUTLASS-based Sparse Experts flow (illustrated in the figure) dispatches input tokens to their designated experts, then applies indexed local reduction on each expert's outputs before a global allreduce. Both dispatching and indexed local reduction incur high overhead in low-latency scenarios. To address this, we propose treating "Sparse Experts as GEMMs" by sending all tokens to each activated expert and masking out unneeded outputs before local reduction. Because grouped GEMMs are memory-bound, the extra computation from redundant tokens has minimal impact, effectively eliminating the costly dispatch and reduction overhead.

@ -229,12 +229,12 @@ We focus on optimizing two kinds of dense GEMMs: Fuse_A_GEMM and RouterGEMM, bec
 ##### Fuse_A_GEMM
 We developed a custom Fuse_A_GEMM that prefetches the majority of its weights into shared memory (enabled by PDL and overlapped with oneshot-AllReduce), significantly enhancing performance. The kernel shows substantial improvements over the default GEMM implementation when num_tokens < 16.

-<img src="../media/tech_blog1_fuse_a_gemm.png?raw=true" alt="tech_blog1_fuse_a_gemm" width="500" height="auto">
+<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog1_fuse_a_gemm.png?raw=true" alt="tech_blog1_fuse_a_gemm" width="500" height="auto">

 ##### RouterGEMM
-By leveraging our internal AI code generator, we automatically generate an optimized RouterGEMM kernel, which delivers substantial improvements over the default GEMM implementation when [num_tokens <=30](https://github.com/NVIDIA/TensorRT-LLM/pull/4115/files#diff-006ae982200a5ef2b27f4aedb526025e64406d3c2fadde329ea745793fac04edR303:~:text=and%20hidden_states.-,size,-(0))
+By leveraging our internal AI code generator, we automatically generate an optimized RouterGEMM kernel, which delivers substantial improvements over the default GEMM implementation when num_tokens <= 30.

-<img src="../media/tech_blog1_router_gemm.png?raw=true" alt="tech_blog1_router_gemm" width="500" height="auto">
+<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog1_router_gemm.png?raw=true" alt="tech_blog1_router_gemm" width="500" height="auto">

 #### Kernel fusion
 Kernel fusion is necessary for min-latency scenarios to reduce extra global memory write/read cost, and we support the following fusion patterns now:
@ -0,0 +1,252 @@
# DeepSeek R1 MTP Implementation and Optimization
by NVIDIA TensorRT-LLM team
## Table of Contents
- [MTP for inference](#mtp-for-inference)
  - [Background](#background)
  - [MTP Vanilla](#mtp-vanilla)
  - [MTP Eagle](#mtp-eagle)
- [MTP implementation in TensorRT-LLM](#mtp-implementation-in-tensorrt-llm)
  - [Basic Implementation](#basic-implementation)
  - [MTP Modules](#mtp-modules)
  - [Attention for MTP](#attention-for-mtp)
  - [How to run DeepSeek models with MTP](#how-to-run-deepseek-models-with-mtp)
- [MTP optimization - Relaxed Acceptance](#mtp-optimization---relaxed-acceptance)
  - [Relaxed Acceptance](#relaxed-acceptance)
  - [How to run the DeepSeek-R1 model with Relaxed Acceptance](#how-to-run-the-deepseek-r1-model-with-relaxed-acceptance)
- [Evaluation](#evaluation)
  - [Achieving speedup with MTP speculative decoding](#achieving-speedup-with-mtp-speculative-decoding)
  - [Accuracy studies for Relaxed Acceptance](#accuracy-studies-for-relaxed-acceptance)
- [Future Works](#future-works)
  - [Tree-based speculative decoding support](#tree-based-speculative-decoding-support)
  - [Eagle3 support](#eagle3-support)
  - [Fix known issues](#fix-known-issues)
- [Acknowledgment](#acknowledgment)

TensorRT-LLM achieves world-record inference performance for DeepSeek-R1 on NVIDIA Blackwell GPUs, where Multi-Token Prediction (MTP) delivers a significant speedup. In our [previous blog post](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md), we discussed the key optimizations that enable the outstanding inference latency of the DeepSeek-R1 model. This article dives deeper into the implementation and optimization of MTP in TensorRT-LLM.

## MTP for inference
Inspired by a previous [research work](https://arxiv.org/pdf/2404.19737), MTP was designed to help DeepSeek-V3 training. It adds additional MTP modules at the end of the main model and uses them to predict additional tokens. In this way, MTP extends the prediction scope to multiple future tokens at each position to achieve better model accuracy. During inference, those MTP modules can also be used for speculative decoding to further improve generation latency. In this section, we introduce the MTP speculative decoding algorithm for LLM inference.

### Background
Speculative decoding is a popular technique for faster and more cost-effective LLM inference. It is based on the premise that generating multiple future tokens (especially in the decode phase, which is less compute-bound) is more efficient than processing a single token. Speculative decoding techniques usually divide the process into a low-cost draft stage and a parallelized verification stage. The draft stage predicts draft tokens by using a small model or a subset of layers in the main model, and the verification stage uses the main model to determine how many of these draft tokens to accept, which is far more efficient than generating one token per iteration.

<div align="center">
<figure>
  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_verify_and_accept.png" alt="tech_blog2_verify_and_accept" width="1280" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 1. Verification example</em></sub></p>

Figure 1 shows an example of how to verify and accept those draft tokens. Assuming there are a total of 5 draft tokens “ABCDE”, we append them to the input token “G” and feed a total of 6 tokens to the main model. After sampling, we get six expected tokens, then compare the expected tokens with the draft tokens and accept the longest matched prefix. In this example, the tokens “ABC” are matched. Because “H” is predicted by the main model and the corresponding input token “C” is already accepted, “H” is also accepted. In this way, we can accept four tokens in a single iteration. MTP also uses this method to verify and accept draft tokens.

For the draft stage in MTP, there are two different methods, MTP Vanilla and MTP Eagle, which suit different inference cases.
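The acceptance rule in Figure 1 is simply a longest-prefix match between the draft tokens and the tokens the main model samples. A minimal plain-Python sketch of strict acceptance (illustrative, not the TensorRT-LLM implementation):

```python
def accept_draft_tokens(draft_tokens, sampled_tokens):
    """Longest-prefix acceptance.

    sampled_tokens has len(draft_tokens) + 1 entries: the main model's
    prediction at the original input position plus one per draft position.
    """
    accepted = []
    for draft, expected in zip(draft_tokens, sampled_tokens):
        accepted.append(expected)  # the main model produced it, so it is trusted
        if draft != expected:      # first mismatch ends acceptance
            return accepted
    accepted.append(sampled_tokens[-1])  # all drafts matched: bonus token
    return accepted

# Figure 1's example: drafts "ABCDE", main model samples "ABCH..".
assert accept_draft_tokens(list("ABCDE"), list("ABCHEF")) == list("ABCH")
```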
### MTP Vanilla

<div align="center">
<figure>
  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_mtp_vanilla.png" alt="tech_blog2_mtp_vanilla" width="640" height="auto">
</figure>
</div>
<p align="left"><sub><em>Figure 2. MTP Vanilla, where t<sub>i</sub> is the input token, d<sub>i</sub> is the predicted draft token, K is the number of MTP modules, and h<sub>i</sub><sup>n</sup> is the hidden state of the n-th MTP module. Note that h<sub>0</sub> means the hidden states of the main model. (Disclaimer: the figures are adapted from the original DeepSeek-V3 tech report)</em></sub></p>

The MTP Vanilla method stays closest to how MTP is trained: it sequentially uses different MTP modules to predict multiple draft tokens. This method can support model checkpoints that carry weights for multiple different MTP modules, and each MTP module has its own KV cache.

Figure 2 illustrates MTP Vanilla inference. In the context phase, assuming there are a total of four input tokens, we get the output token $t_5$ and the hidden states after the main model forward. The output token is appended to the input tokens; we then shift out the first token to get tokens $t_2$ through $t_5$ as the input tokens of the first MTP module. The hidden states from the main model are used directly as the input of the first MTP module to predict the first draft token. For the next several MTP modules, we use the same method to prepare the inputs to predict the sequential draft tokens.

In the generation phase, there is a small difference. The predicted token $t_5$ and the draft tokens are used as inputs for the main model. After the main model forward, we run verification to get the accepted tokens. In this example, assume $j$ draft tokens $d_6$~$d_{j+5}$ are accepted. We then prepare the MTP module inputs. Different from the context phase, we prepare input IDs and hidden states for a total of $K$ tokens before the last accepted token; in this example, the last accepted token is $t_{j+6}$. We then get the first draft token after the first MTP module forward. For the sequential MTP modules, we prepare their inputs in a way similar to the MTP modules in the context phase, so all of those MTP modules have the same input sequence length. After predicting all of the draft tokens, we need to evict the keys/values of the rejected draft tokens from the main model's KV cache to ensure the subsequent computation is correct.

### MTP Eagle

<div align="center">
<figure>
  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_mtp_eagle.png" alt="tech_blog2_mtp_eagle" width="640" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 3. MTP Eagle, using the same notation as Figure 2</em></sub></p>

MTP Eagle can be viewed as a variant of the [Eagle](https://arxiv.org/pdf/2401.15077) speculative decoding method, but it only supports chain decoding for now. It reuses the same MTP module repeatedly to predict draft tokens, so it supports model checkpoints with only one MTP module; the official DeepSeek-V3 and DeepSeek-R1 checkpoints have exactly one. Another difference from MTP Vanilla is the KV cache: in MTP Eagle, the MTP module reuses the same KV cache while predicting multiple draft tokens.

Figure 3 gives an MTP Eagle example. In the context phase, the inputs of the first MTP module forward are the same as in MTP Vanilla. For the subsequent MTP module forwards, the first difference is that MTP Eagle uses the same MTP module and reuses the same KV cache. Another difference is that we only need to input the token ID and hidden state of one token: the token is the last predicted draft token, while the hidden state is the corresponding hidden state from the last MTP module forward. In this way, we can predict a total of $K$ draft tokens using only one MTP module.

In the generation phase, the verification stage is the same as MTP Vanilla. After getting the accepted tokens, we use the last accepted token and the corresponding hidden state as the inputs of the first MTP module forward. Compared with MTP Vanilla, this is much easier to implement, and the subsequent MTP module forwards prepare inputs the same way as in the context phase. After predicting all of the draft tokens, we evict the keys/values of the rejected draft tokens from the main model's KV cache.
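Condensing the two variants: MTP Eagle's draft loop reuses one module and feeds each predicted token (and its hidden state) back into the next step. A schematic sketch (`mtp_module` is a stand-in callable, not the real module interface):

```python
def draft_with_mtp_eagle(mtp_module, last_token, last_hidden, K):
    """Predict K draft tokens by chaining a single MTP module.

    Each step consumes one (token, hidden state) pair and yields the next
    draft token plus the hidden state for the following step; the module
    reuses one KV cache across all K steps.
    """
    draft_tokens = []
    token, hidden = last_token, last_hidden
    for _ in range(K):
        token, hidden = mtp_module(token, hidden)  # stand-in forward pass
        draft_tokens.append(token)
    return draft_tokens
```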
## MTP implementation in TensorRT-LLM
### Basic Implementation
TensorRT-LLM has two different paths for MTP, one for [MTP Vanilla](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/speculative/mtp.py#L1047) and another for [MTP Eagle](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/speculative/mtp.py#L1047). MTP Eagle is the default path for DeepSeek-V3 and DeepSeek-R1 models.

<div align="center">
<figure>
  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_overall_workflow.png" alt="tech_blog2_overall_workflow" width="800" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 4. MTP workflow in TensorRT-LLM</em></sub></p>

Figure 4 shows the overall workflow of MTP in TensorRT-LLM. Both paths share the runtime workflow; the differences lie in the MTP module forwards. In the context phase, there are no draft tokens in the inputs. The TensorRT-LLM model engine fetches the input IDs from the requests and runs the main model forward to get the next token and the hidden state. We then prepare the MTP module inputs and run the MTP modules forward to predict the draft tokens.

The generation workflow is more complicated because we need both the verification and draft stages. The predicted new token and draft tokens are the inputs for the main model. After the main model forward, we sample from the output logits to get the following new tokens, then compare them with the input draft tokens to get the final accepted tokens; this completes the verification stage. We then use the accepted tokens and hidden states to start a new draft stage, which uses the MTP layers to predict new draft tokens for the next iteration. Finally, we rewind the KV cache to evict keys/values corresponding to the rejected tokens.

Except for the KV cache rewind, all of these steps run inside the model engine's forward function. In this way, one model engine can support MTP inference, and it is easier for MTP to be compatible with other features, such as CUDA graphs and the overlap scheduler. When CUDA graphs are enabled, both the verification and draft stages can be captured in one graph, significantly reducing CPU overhead.
### MTP Modules

<div align="center">
<figure>
  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_mtp_modules.png" alt="tech_blog2_mtp_modules" width="640" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 5. MTP model architecture</em></sub></p>

Figure 5 introduces the basic model architecture of [MTP Vanilla](https://github.com/NVIDIA/TensorRT-LLM/blob/338744fba6a91147b739b7f02d19b37bc19aa17a/tensorrt_llm/_torch/speculative/mtp.py#L326), [MTP Eagle](https://github.com/NVIDIA/TensorRT-LLM/blob/338744fba6a91147b739b7f02d19b37bc19aa17a/tensorrt_llm/_torch/speculative/mtp.py#L1047), and the basic [MTP module](https://github.com/NVIDIA/TensorRT-LLM/blob/338744fba6a91147b739b7f02d19b37bc19aa17a/tensorrt_llm/_torch/models/modeling_deepseekv3.py#L829) design. Because MTP Vanilla needs $K$ input tokens, if the number of accepted tokens is less than the number of input tokens, i.e. $j<K$, we need to use older token IDs and hidden states as the input of the first MTP module. To avoid much additional computation overhead, we add two tensors per request that save the past $K$ input IDs and the hidden states of the past $K$ tokens, and we update them with the accepted tokens and corresponding hidden states each iteration. This way, we can read these tensors when preparing inputs for the first MTP module. The MTP Eagle implementation is much simpler and more straightforward: just call the same MTP module forward $K$ times to get $K$ new draft tokens.

The MTP module follows the design in DeepSeek-V3. The embedding layer and output head of the MTP modules are shared with the main model, which saves GPU memory.

### Attention for MTP

Attention is also a very important component for supporting MTP inference. The changes are mainly in the attention kernels for the generation phase. For a normal request, there is only one input token in the generation phase, but for MTP, there are $K+1$ input tokens. Since MTP sequentially predicts additional tokens, the predicted draft tokens are chained. Although there is an MTP Eagle path, it currently only has chain-based support, so a causal mask is enough for the attention kernel to support MTP. In our implementation, TensorRT-LLM uses the FP8 FlashMLA generation kernel on Hopper GPUs, and TRTLLM customized attention kernels on Blackwell for better performance.
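Because the chained draft tokens only require causal masking, the generation-phase mask for the $K+1$ new tokens is just a lower-triangular extension over the cached context. A tiny NumPy illustration (shapes only; real kernels never materialize this mask):

```python
import numpy as np

def mtp_generation_mask(past_len: int, k_plus_1: int) -> np.ndarray:
    """Boolean attention mask for K+1 chained tokens: every new token sees
    all past_len cached tokens, and new tokens attend causally to each other."""
    past = np.ones((k_plus_1, past_len), dtype=bool)          # past fully visible
    new = np.tril(np.ones((k_plus_1, k_plus_1), dtype=bool))  # causal among new
    return np.concatenate([past, new], axis=1)

print(mtp_generation_mask(past_len=4, k_plus_1=3).astype(int))
```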
### How to run DeepSeek models with MTP
|
||||
Run DeepSeek-V3/R1 models with MTP, use [examples/pytorch/quickstart_advanced.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/pytorch/quickstart_advanced.py) with additional options:
|
||||
|
||||
```bash
|
||||
cd examples/pytorch
|
||||
python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_nextn N
|
||||
```

To benchmark min-latency performance with MTP, you need to follow [this document](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/models/core/deepseek_v3/README.md#6-dataset-preparation) to prepare your dataset, then follow the steps below:

```bash
YOUR_DATA_PATH=<your dataset file following the format>

cat >./extra-llm-api-config.yml <<EOF
use_cuda_graph: true
moe_backend: TRTLLM
speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 3
EOF

export TRTLLM_ENABLE_PDL=1

trtllm-bench --model nvidia/DeepSeek-R1-FP4 \
  throughput \
  --dataset $YOUR_DATA_PATH \
  --backend pytorch \
  --num_requests 10 \
  --concurrency 1 \
  --max_batch_size 1 \
  --tp 8 \
  --ep 2 \
  --extra_llm_api_options ./extra-llm-api-config.yml
```

## MTP optimization - Relaxed Acceptance

DeepSeek-R1 is a reasoning model that first emits thinking tokens before producing the actual outputs. The thinking process usually consumes many tokens, and the quality of the thinking tokens may have only a limited impact on the final answer. We therefore use a more aggressive acceptance strategy, called [relaxed acceptance](https://github.com/NVIDIA/TensorRT-LLM/pull/3865), during the thinking phase to speed up the thinking decoding phase. This trades some output quality for speedup; experimental results show the impact on output quality is limited.

### Relaxed Acceptance

<div align="center">
<figure>
  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_relaxed_acceptance.png" alt="tech_blog2_relaxed_acceptance" width="1024" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 6. Relaxed Acceptance example. Use MTP nextn=4 and top-3 in this example.</em></sub></p>

In verification and acceptance as described so far, we use top-1 sampling on the main model's logits to get the "expected" tokens, as shown in Figure 1. There is only one candidate to compare with the draft tokens; we call this "Strict Acceptance".

With Relaxed Acceptance, we first take the top-N tokens sampled from the logits, so more candidates are compared with the input draft tokens. To keep the accepted tokens as accurate as possible, we also add a probability threshold, delta. We obtain token probabilities by applying a softmax to the logits; after taking the top-N tokens, we remove any candidate whose probability is smaller than (top-1 probability - delta). This can leave more than one candidate, all of which have high probability. We then compare each input draft token against these candidates; if any candidate matches, the draft token is accepted, which increases the acceptance rate. Figure 6 shows an example comparison between Strict Acceptance and Relaxed Acceptance, and the rule is sketched below.
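A minimal, self-contained sketch of the candidate construction and the relaxed check described above (top-N plus the delta threshold; greedy top-1 corresponds to Strict Acceptance):

```python
import torch

def relaxed_candidates(logits: torch.Tensor, top_n: int = 10, delta: float = 0.6):
    """Build the Relaxed Acceptance candidate set: take the top-N tokens, then
    drop any whose probability is below (top-1 probability - delta)."""
    probs = torch.softmax(logits, dim=-1)
    top_p, top_ids = probs.topk(top_n)             # sorted descending
    keep = top_p >= (top_p[0] - delta)             # probability threshold
    return set(top_ids[keep].tolist())

def relaxed_accept(draft_token: int, logits: torch.Tensor,
                   top_n: int = 10, delta: float = 0.6) -> bool:
    # A draft token is accepted if it matches any surviving candidate.
    return draft_token in relaxed_candidates(logits, top_n, delta)

logits = torch.tensor([2.0, 1.9, 0.1, -1.0])
print(relaxed_candidates(logits, top_n=3, delta=0.2))  # {0, 1}: both near top-1
print(relaxed_accept(1, logits, top_n=3, delta=0.2))   # True under the relaxed rule
```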

Note that Relaxed Acceptance is used only during the thinking phase; Strict Acceptance is still used during the non-thinking phase. Relaxed Acceptance currently supports only the DeepSeek-R1 model.

### How to run the DeepSeek-R1 model with Relaxed Acceptance

To run DeepSeek-R1 models with MTP Relaxed Acceptance, use [examples/pytorch/quickstart_advanced.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/pytorch/quickstart_advanced.py) with the additional options:

```bash
cd examples/pytorch
python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_nextn N --use_relaxed_acceptance_for_thinking --relaxed_topk 10 --relaxed_delta 0.6
```

To benchmark min-latency performance with MTP Relaxed Acceptance, you need to follow [this document](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/models/core/deepseek_v3/README.md#6-dataset-preparation) to prepare your dataset, then follow the steps below:

```bash
YOUR_DATA_PATH=<your dataset file following the format>

cat >./extra-llm-api-config.yml <<EOF
use_cuda_graph: true
moe_backend: TRTLLM
speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 3
  use_relaxed_acceptance_for_thinking: true
  relaxed_topk: 10
  relaxed_delta: 0.6
EOF

export TRTLLM_ENABLE_PDL=1

trtllm-bench --model nvidia/DeepSeek-R1-FP4 \
  throughput \
  --dataset $YOUR_DATA_PATH \
  --backend pytorch \
  --num_requests 10 \
  --concurrency 1 \
  --max_batch_size 1 \
  --tp 8 \
  --ep 2 \
  --extra_llm_api_options ./extra-llm-api-config.yml
```

## Evaluation

### Achieving speedup with MTP speculative decoding

<div align="center">
<figure>
  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_perf_and_ar.png" alt="tech_blog2_perf_and_ar" width="1280" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 7. DeepSeek-R1-FP4 671B min-latency performance with different MTP next-n</em></sub></p>

We tested the min-latency (batch size = 1) performance of the DeepSeek-R1-FP4 model with different MTP next-n values on a B200 node. MLA runs with TP=8 and MoE runs with EP=2, using ten different requests with ISL/OSL = 1K/2K. Figure 7 shows that nextn=3 gives the best min-latency performance on 8 B200 GPUs, bringing a 2.16x speedup over the nextn=0 baseline. With relaxed acceptance, min-latency performance improves further to a 2.33x speedup. We also evaluated the CUDA graph and overlap scheduler benefits: for this min-latency case, CUDA graphs bring a 7.22x average speedup, and the overlap scheduler adds a 1.03x average speedup.

### Accuracy studies for Relaxed Acceptance

<div align="center">
<figure>
  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_acc_relaxed_acceptance.png" alt="tech_blog2_acc_relaxed_acceptance" width="800" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 8. Ablation results for the Relaxed Acceptance. Using MTP nextn=3, top-10, and delta=0.6.</em></sub></p>

We validated the Relaxed Acceptance on different datasets. In Figure 8, we show the ablation results for Relaxed Acceptance using the DeepSeek-R1-FP4 model. Compared with Strict Acceptance, the impact of Relaxed Acceptance on output quality is limited, resulting in only a slight accuracy drop.

## Future Works

### Tree-based speculative decoding support

<div align="center">
<figure>
  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_tree_spec_decoding.png" alt="tech_blog2_tree_spec_decoding" width="800" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 9. Comparison between the chain-based and tree-based speculative decoding</em></sub></p>

The TensorRT-LLM PyTorch backend currently supports only chain-based speculative decoding, for both MTP Vanilla and MTP Eagle. However, tree-based speculative decoding is widely used in previous advanced methods, such as Eagle2 and Eagle3, to increase the acceptance rate, and MTP in TensorRT-LLM can also be extended to support it. Figure 9 compares the chain-based and tree-based methods. Both full-tree and dynamic-tree variants expand the candidate combinations, giving more choices for the draft tokens.

### Eagle3 support

Another important method is Eagle3. The promising results in the [Eagle3 paper](https://arxiv.org/pdf/2503.01840) show that it can greatly increase the acceptance rate by leveraging hidden states from different levels to predict draft tokens. Since TensorRT-LLM already has [Eagle-3 support](https://github.com/NVIDIA/TensorRT-LLM/pull/3035), we also plan to train an Eagle3 head to support DeepSeek-V3/R1 + Eagle3 and achieve better speedup.

### Fix known issues

There are still some known issues, which we will fix soon:

- The MTP Vanilla path has a known accuracy issue. We will fix it and refactor the MTP Vanilla implementation.
- MTP Eagle is currently non-deterministic.
- There is an accuracy issue when MTP and attention DP are enabled together.

## Acknowledgment

This was a remarkable cross-team effort to support and optimize MTP in TensorRT-LLM. We extend our gratitude to everyone who contributed to making this possible; it involved a typical system/algorithm co-design approach spanning multiple technical layers, including kernel optimization, runtime enhancements, algorithmic improvements, and performance measurement and analysis. Special thanks go to the DeepSeek team for developing the MTP method, which lays the foundation for this blog.

@ -0,0 +1,174 @@

# Optimizing DeepSeek R1 Throughput on NVIDIA Blackwell GPUs: A Deep Dive for Developers

By NVIDIA TensorRT-LLM team

## Table of Contents
- [Introduction](#introduction)
- [Precision strategy](#precision-strategy)
- [Parallel strategy](#parallel-strategy)
  - [Weights absorb and MQA](#weights-absorb-and-mqa)
  - [Data Parallel for Attention module (ADP)](#data-parallel-for-attention-module-adp)
  - [Expert parallel for MoE (EP)](#expert-parallel-for-moe-ep)
- [MLA Layers Optimizations](#mla-layers-optimizations)
- [MoE Layers Optimizations](#moe-layers-optimizations)
- [Runtime Optimizations](#runtime-optimizations)
- [How to reproduce](#how-to-reproduce)
- [Future Works](#future-works)
- [Acknowledgment](#acknowledgment)

## Introduction

The open-source DeepSeek R1 model's innovative architecture, including multi-head latent attention (MLA) and a large sparse Mixture-of-Experts (MoE), significantly improves LLM inference efficiency. However, harnessing the full potential of such an innovative structure requires equally important hardware/software co-optimization. This post delves into the optimization strategies NVIDIA developed within TensorRT-LLM for DeepSeek R1 throughput-oriented scenarios (TPS/GPU) on NVIDIA's Blackwell B200 GPUs, and explores the rationale behind each enhancement. [The other min-latency optimization blog](./blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md) explains in detail how TensorRT-LLM optimizes R1 performance to achieve the best TPS/user.

These optimizations have significantly boosted DeepSeek R1 throughput on Blackwell: performance increased from approximately 2000 TPS/GPU in February to 4600 TPS/GPU on an ISL/OSL 1K/2K dataset. The optimizations are general and applicable to other ISL/OSL configurations too. They fall broadly into three areas: MLA layers, MoE layers, and runtime.

## Precision strategy

The mixed-precision recipe for the DeepSeek R1 throughput scenario is almost the same as [what](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md#precision-strategy) is used for the latency-oriented scenario, with the following differences:

* FP8 KV cache and FP8 attention, rather than BF16 precision.
* FP4 AllGather for better communication bandwidth utilization.

The checkpoint used in this blog is hosted at [nvidia/DeepSeek-R1-FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), generated by [NVIDIA Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). The accuracy scores of common datasets on this FP4 checkpoint with the TensorRT-LLM implementation are:

| Precision | GPQA Diamond | MATH-500 |
| :-- | :-- | :-- |
| TensorRT-LLM FP8 | 0.697 | 0.954 |
| TensorRT-LLM FP4 | 0.705 | 0.96 |

*Note: there is some run-to-run variance in these evaluations, so the FP4 numbers are slightly higher here. We consider FP4 accuracy comparable to FP8 on these datasets.*

The MoE layers inside this checkpoint have been quantized to FP4. Quantizing the MoE layer weights to FP4 has the following benefits:

* Fully utilizes the 5th-generation Tensor Core FLOPS of NVIDIA Blackwell GPUs.
* Reduces the weight memory traffic for MoE by almost half. The MoE parts are still memory-bound in the decoding phase for this scenario, and 97% of the weights in the DeepSeek R1 model are in MoE layers.
* Reduces the memory footprint of the model weights, freeing more GPU memory for KV cache and thus increasing the maximum concurrency. [The original FP8 checkpoint of the DeepSeek R1 model](https://huggingface.co/deepseek-ai/DeepSeek-R1) is about 640 GB, while the NVIDIA-provided [DeepSeek R1 FP4 quantized model](https://huggingface.co/nvidia/DeepSeek-R1-FP4) is only about 400 GB.

The precision of the FP8 KV cache and FP8 attention kernels was evaluated on the GSM8K dataset, with no obvious accuracy drop; see the table in the FP8 KV cache section for the numbers. Users can still opt out and use BF16 KV cache and attention if they observe accuracy differences on their datasets.

## Parallel strategy

The parallelism strategy for the DeepSeek R1 throughput scenario is different from what is used for latency-oriented scenarios.

| Components | Parallel Patterns |
| :---- | :---- |
| Attention Modules | Data Parallelism 8 (DP8) |
| MoE Sparse Experts | Expert Parallelism 8 (EP8) |
| MoE Shared Experts | DP8 |
| Fuse_A GEMM | DP8 |
| Router GEMM | DP8 |

The following sections explain why DP and EP were chosen instead of tensor parallelism (TP).

### Weights absorb and MQA

The core idea of MLA is low-rank joint compression of the attention keys and values to reduce KV cache size during inference. In the MLA formulas, the down-projected KV latent is up-projected to multiple heads and combined with the up-projected Q to form normal multi-head attention (MHA). Thanks to the associativity of matrix multiplication, the K up-projection matrix (W^UK) can first be multiplied with the Q up-projection matrix (W^Q), and the result then applied to Q; likewise, the V up-projection matrix (W^UV) and the attention output projection matrix W^O can be multiplied after the attention output. The DeepSeek-V2 technical report calls this technique "absorb". After the weights are absorbed, MLA is equivalent to multi-query attention (MQA). Please see the [original DeepSeek-V2 technical paper](https://arxiv.org/pdf/2405.04434) for the detailed formulas and explanations; the following block diagram shows the computational flow of weights-absorbed MLA in TensorRT-LLM.



For the decoding phase, weight absorption significantly reduces the math FLOPS needed to up-project K and V: the FLOPs for these KV up-projections are linear in the KV cache length, while the Q length is always 1 in decoding. The longer the KV history, the more FLOPs are needed, and the up-projections are repeated for every decoded token because only the projected KV latents are saved, which further increases the FLOPs needed.

For the prefill phase, the absorbed version changes the dimensions of Q and KV and thus increases the number of FLOPs for attention. Based on roofline analysis, the non-absorbed version is beneficial for the prefill phase with input length 256 or larger.

The TensorRT-LLM MLA implementation chooses different highly optimized kernels for prefill and decoding; see [MLA](../../../../tensorrt_llm/_torch/modules/attention.py). The associativity trick is easy to verify numerically, as shown below.
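A small numerical check of the absorb idea with illustrative dimensions (not DeepSeek's exact sizes): folding W^UK into the query side turns a per-step projection of the whole KV history into a single small matrix-vector product.

```python
import torch

d_latent, d_head, kv_len = 512, 128, 4096
W_UK = torch.randn(d_latent, d_head, dtype=torch.float64)  # K up-projection (one head)
q = torch.randn(d_head, dtype=torch.float64)               # one decode-step query head
C = torch.randn(kv_len, d_latent, dtype=torch.float64)     # cached KV latents

scores_naive = (C @ W_UK) @ q    # up-projects all kv_len latents every step
scores_absorb = C @ (W_UK @ q)   # one small matvec folds W_UK into the query
assert torch.allclose(scores_naive, scores_absorb)
print(scores_absorb.shape)       # torch.Size([4096]) attention scores
```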

### Data Parallel for Attention module (ADP)

The intuition for choosing attention DP is that doing TP for the MQA (where different GPUs compute different attention Q heads) duplicates the KV cache memory, which limits the concurrency the system can achieve. The duplication factor equals the TP group size, i.e., 8x for TP8. Small concurrency hurts throughput on a powerful system like the NVIDIA DGX B200.

For the DeepSeek R1 FP4 checkpoint on 8 B200 GPUs, the weights and activations occupy about 80 GB of memory per GPU, leaving about 100 GB per GPU for KV cache. Assuming ISL 1K and OSL 2K, each request consumes about 200 MB of KV cache, giving a per-GPU max concurrency of about 500. A single-node 8-GPU system then has a global concurrency of about 4000; with attention TP, the global concurrency would be just 500, as the quick arithmetic below shows.
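The arithmetic behind those figures, using the round numbers from the paragraph above:

```python
free_kv_bytes = 100 * 1024**3        # ~100 GB free for KV cache per GPU
kv_per_request = 200 * 1024**2       # ~200 MB KV cache per request (ISL 1K + OSL 2K)

per_gpu = free_kv_bytes // kv_per_request
print(per_gpu)                       # 512, i.e. roughly 500 requests per GPU
print(per_gpu * 8)                   # ~4000 global concurrency with attention DP8
# With attention TP8 the KV cache is duplicated on every GPU,
# so the global concurrency stays at ~500 instead of ~4000.
```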

Silicon experiments show that the attention DP technique provides a significant **400% speedup** in max-throughput cases, keeping all other factors the same.

### Expert parallel for MoE (EP)

The DeepSeek R1 MoE design features 256 small sparse experts and 1 shared expert; the GEMM problem sizes of these experts are as follows:

| GEMM | group | GEMM N | GEMM K |
| :---- | :---- | :---- | :---- |
| shared_fc1 | 1 | 4096 | 7168 |
| shared_fc2 | 1 | 7168 | 2048 |
| sparse_fc1 | 256 | 4096 | 7168 |
| sparse_fc2 | 256 | 7168 | 2048 |

These experts can be parallelized with either tensor parallelism or expert parallelism. Our ablation study shows that expert parallelism achieves better GEMM FLOPS because it keeps better GEMM problem sizes. Expert parallelism also saves GPU communication bandwidth compared to AllReduce, because tokens only need to be sent to the GPUs hosting the active experts for that token, while TP needs an AllReduce over all tokens across all GPUs, as sketched below. Note also that scaling DeepSeek R1 inference to systems like GB200 NVL72, to fully utilize the aggregated memory bandwidth and Tensor Core FLOPS, requires large EP sizes; we are actively working on implementing this.
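A toy illustration of EP dispatch, assuming experts are placed contiguously across 8 GPUs (32 experts each); the placement and routing here are illustrative, not the production kernel:

```python
import torch

def dispatch_counts(topk_expert_ids: torch.Tensor,
                    num_gpus: int = 8, experts_per_gpu: int = 32) -> torch.Tensor:
    """Count how many token copies each GPU receives under expert parallelism:
    a token is sent once per distinct GPU hosting one of its top-k experts."""
    owner = topk_expert_ids // experts_per_gpu       # [tokens, k] -> GPU ids
    sent = torch.zeros(num_gpus, dtype=torch.long)
    for row in owner:
        for gpu in row.unique():                     # one copy per target GPU
            sent[gpu] += 1
    return sent

tokens = torch.randint(0, 256, (16, 8))              # 16 tokens, top-8 of 256 experts
print(dispatch_counts(tokens))                       # token copies per GPU
```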

Silicon performance measurements show that expert parallelism provides a **142% speedup** for the 1K/2K max-throughput case, keeping other factors the same.

## MLA Layers Optimizations

Beyond the parallel and precision strategies explained above, we have made the following optimizations to the layers/kernels inside the MLA module.

* Attention Kernels Optimization

This provided a **20% E2E speedup** over the February baseline implementation. It involved implementing **high-throughput generation MLA kernels**. Techniques include using the 2-CTA group variant of the Blackwell 5th-generation Tensor Core MMA instructions, overlapping MLA with softmax using interleaved tiles, and fine-tuning kernel-selection heuristics for the DeepSeek R1 problem sizes.

* FP8 KV Cache

An important optimization that yielded a **6% E2E throughput increase** at identical concurrency. Another benefit of the FP8 KV cache is **compressing the KV cache size by half**, which **allows for larger concurrency**. It also enables faster FP8 attention kernels compared to BF16. We recommend that users always turn on the FP8 KV cache for better performance. In the context phase, KV is quantized to FP8 and saved to the KV cache pool. In the generation phase, both Q and KV are quantized to FP8, and FP8 multi-query attention (MQA) is used. Evaluation on GSM8K showed **no meaningful accuracy drop**. The quantization typically uses static per-tensor FP8 with a scaling factor defaulting to 1.0, but the KV cache scaling factor can also be generated by calibrating on a target dataset. Below are the accuracy metrics of different combinations on the GSM8K dataset.

| KV Cache Type | FP8 Checkpoint | FP4 Checkpoint |
| :---- | :---- | :---- |
| BF16 MLA and KV cache | 0.9629 | 0.9606 |
| FP8 MLA and KV cache | 0.9613 | 0.9606 |

* Manual GEMM tactics tuning

This optimization addresses cases where the default heuristic algorithm in cuBLAS does not perform best for specific GEMM shapes in the model. We built an internal tool to find the best algorithm for these shapes offline, then used the `cublasLtMatmul` API to apply the optimized algorithm at runtime. This is a necessary system optimization when general-purpose heuristics do not find the most efficient kernel for every case. We are also actively working with the cuBLAS team to further improve the heuristics so that the best performance is always achieved out of the box. See [cublasScaledMM.cpp](https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/thop/cublasScaledMM.cpp#L54) for the tuning details.

* Horizontal Fusions

This fuses the GEMM operations for the Q/KV down-projections and the RoPE dimensions of the K tensor; see [modeling_deepseekv3.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/models/modeling_deepseekv3.py#L1305) for details. Horizontal fusion reduces kernel launch overhead and enlarges the GEMM problem sizes, achieving better hardware utilization. It is a common technique shared by both min-latency and throughput optimizations; the sketch below shows the basic idea.
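A generic sketch of horizontal fusion, with illustrative shapes (Q down-projection and KV latent + RoPE widths in the spirit of DeepSeek-V3, not the exact fused layout in TensorRT-LLM):

```python
import torch

x = torch.randn(8, 7168, dtype=torch.float64)           # 8 tokens, hidden size 7168
W_q = torch.randn(7168, 1536, dtype=torch.float64)      # Q down-projection
W_kv = torch.randn(7168, 576, dtype=torch.float64)      # KV latent + K rope dims

q_sep, kv_sep = x @ W_q, x @ W_kv                        # two kernel launches
fused_out = x @ torch.cat([W_q, W_kv], dim=1)            # one larger GEMM
q_fused, kv_fused = fused_out.split([1536, 576], dim=1)  # split the fused output

assert torch.allclose(q_sep, q_fused) and torch.allclose(kv_sep, kv_fused)
```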

* 2-stream optimizations

Some small, mutually independent operations inside MLA, such as the Q norm and KV norm, cannot individually saturate the GPU's math FLOPS or memory bandwidth, so running them on parallel CUDA streams brings a speedup, as sketched below.
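A minimal PyTorch sketch of the idea (requires a CUDA device and PyTorch 2.4+ for `torch.nn.RMSNorm`; the shapes and the use of RMSNorm here are illustrative):

```python
import torch

q = torch.randn(8, 1536, device="cuda")
kv = torch.randn(8, 576, device="cuda")
q_norm = torch.nn.RMSNorm(1536, device="cuda")
kv_norm = torch.nn.RMSNorm(576, device="cuda")

s1, s2 = torch.cuda.Stream(), torch.cuda.Stream()
torch.cuda.synchronize()              # make the inputs visible to both streams
with torch.cuda.stream(s1):
    q_out = q_norm(q)                 # can run concurrently with kv_norm below
with torch.cuda.stream(s2):
    kv_out = kv_norm(kv)
torch.cuda.synchronize()              # join both streams before dependent ops
```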

## MoE Layers Optimizations

The following optimizations have been implemented for the MoE layers.

* Mix I/O data type for the router GEMM

Achieved a **4% E2E speedup** by avoiding casting operations and performing the GEMM directly with mixed input and output data types (e.g., BF16 input and FP32 output). This eliminates the explicit cast of inputs to the output type and saves memory bandwidth.

* Top-K Kernels Fusions

Resulted in a **7.4% E2E speedup**. For DeepSeek R1, selecting the top 8 of 256 experts is done in two phases: first selecting the top groups, then finding the top 8 experts within those groups. DeepSeek R1 also applies bias and scale terms for better expert load balance, which adds to the top-K computation. Unfused, all of this amounts to 18 PyTorch ops; see [Deepseekv3RoutingImpl](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/models/modeling_deepseekv3.py#L213). Fusing the kernels involved in these top-K calculations significantly reduces the overall computation time: compared to 18 native PyTorch ops, fusion reduces the work to as few as 2 kernels. Measured on B200, fusing these kernels cuts the kernel time from 252 us to 15 us in the target setting. The two-phase selection is sketched below.
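A simplified two-phase selection, assuming 8 groups of 32 experts; the real routing also folds in the bias and scale terms mentioned above, which this sketch omits:

```python
import torch

def two_phase_topk(scores: torch.Tensor, num_groups: int = 8,
                   top_groups: int = 4, top_k: int = 8):
    """Phase 1: score and keep the best expert groups.
    Phase 2: pick the top-k experts inside the surviving groups."""
    g = scores.view(num_groups, -1)                  # [groups, experts_per_group]
    group_score = g.max(dim=-1).values               # one score per group
    keep = group_score.topk(top_groups).indices      # phase 1: best groups
    masked = torch.full_like(g, float("-inf"))
    masked[keep] = g[keep]                           # phase 2: mask other groups
    return masked.view(-1).topk(top_k)               # top-8 experts overall

print(two_phase_topk(torch.randn(256)).indices)      # 8 selected expert ids
```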

* FP4 AllGather Optimizations

Showed a **4% E2E speedup**. This optimization replaces the BF16 AllGather operation with an FP4 version. Using lower precision for this communication primitive reduces the amount of data transferred over the network, significantly improving communication efficiency. Since the gathered BF16 tensor would be cast to FP4 after the AllGather anyway, this optimization does not affect accuracy. At the kernel level, we see about a 3x speedup when switching from BF16 to FP4 AllGather.

* CUTLASS Group GEMM optimizations

Provided a **1.3% E2E speedup**. Some CUTLASS-level optimizations are shared by both min-latency and throughput cases. Simply updating CUTLASS to the latest version gave a 13% kernel improvement for the MoE group GEMM, resulting in a +1.3% E2E TPS/GPU.

* Multi-stream optimizations

Running the shared and routed experts on two streams, combined with the other multi-stream optimizations in the MLA modules, contributes a **5.3% E2E speedup**.

## Runtime Optimizations

These optimizations target the overall execution flow, scheduling, and resource management of the inference system. They are shared between DeepSeek R1 and the other models supported in TensorRT-LLM; here we share ablation results for DeepSeek R1 on B200.

* CUDA Graph

This had a significant **22% E2E performance impact** for throughput scenarios. CUDA Graphs capture a sequence of CUDA operations and launch them as a single unit, drastically reducing kernel launch overheads. This is particularly beneficial for models with many small kernels, and especially on the PyTorch flow, where the Python host code normally executes more slowly than C++. Because a CUDA Graph freezes the kernel launch parameters, which are normally tied to tensor shapes, it can only be used safely with static shapes, so different graphs must be captured for different batch sizes. Each graph costs memory and capture time, so we cannot capture a graph for every possible batch size; non-captured batch sizes fall back to PyTorch eager mode. TensorRT-LLM's CUDA Graph padding feature is a good trade-off between the number of graphs and the graph hit ratio: it pads a batch to the nearest captured size. You should normally enable padding to increase the hit rate, but the padding itself has some overhead from wasted token computation. Users can opt out of the padding feature to compare performance by setting `cuda_graph_padding_enabled` to false; see the [PyTorch backend config](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/config.py#L41). The padding rule is sketched below.
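The padding rule itself is simple; this sketch assumes an illustrative list of captured batch sizes:

```python
def pick_cuda_graph_batch(batch_size: int,
                          captured=(1, 2, 4, 8, 16, 32, 64, 128)):
    """Pad the runtime batch up to the nearest captured size so a graph can be
    replayed; fall back to eager mode when no captured graph is large enough."""
    for size in captured:            # assumed sorted ascending
        if size >= batch_size:
            return size              # pad with dummy requests up to this size
    return None                      # no captured graph: run PyTorch eager mode

assert pick_cuda_graph_batch(3) == 4      # padded from 3 to the captured size 4
assert pick_cuda_graph_batch(200) is None # too large: eager mode
```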

* Overlap Scheduler

Showed a **4% E2E performance impact** and should generally **always be used**. This scheduler arranges different operations (such as computation and communication) so they overlap effectively on the GPU and the network. The intuition is to hide latency by computing while waiting for data transfers, or vice versa, improving overall hardware utilization. The overlap scheduler has been enabled by default in TensorRT-LLM since [this commit](https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428#diff-3c4f29d6594b37af0f1fbb97f5291b18e49f3f2510f9d296c7adb2829e9da0bf). For corner cases where it does not work, users can still opt out by setting *disable_overlap_scheduler* to true.

* Memory Optimizations

Resulted in a **4 GB improvement**. This includes techniques like chunked MoE (specifically for Hopper) and fixing a CUDA context initialization bug. These methods reduce the memory footprint of the model weights and intermediate tensors, allowing larger batch sizes or sequence lengths and preventing out-of-memory (OOM) errors.

## How to reproduce

See [Perf practices](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md#b200-max-throughput)

## Future Works

- Large EP
- Chunked context
- More communication overlap

## Acknowledgment

The substantial throughput advancements for DeepSeek R1 on Blackwell GPUs, as detailed in this post, are the fruit of a dedicated and collaborative engineering effort. Achieving nearly a 2.3x increase in TPS/GPU required a deep dive into MLA layers, MoE layers, and runtime optimizations. We extend our sincere appreciation to all the engineers involved in this intensive optimization process. Their collective expertise in pushing the boundaries of throughput performance within TensorRT-LLM has been instrumental. We trust that sharing these specific strategies for maximizing throughput will prove beneficial to the developer community as they tackle demanding LLM inference workloads on NVIDIA hardware.

@ -20,6 +20,7 @@ The LLM API can be used for both offline or online usage. See more examples of t
llm_inference_async
llm_inference_distributed
llm_logits_processor
llm_eagle2_decoding
llm_inference_kv_events
llm_lookahead_decoding
llm_quantization

@ -11,6 +11,7 @@ LLM Examples
llm_inference_async
llm_inference_distributed
llm_logits_processor
llm_eagle2_decoding
llm_inference_kv_events
llm_lookahead_decoding
llm_quantization

8 _sources/examples/llm_eagle2_decoding.rst.txt Normal file
@ -0,0 +1,8 @@
Generate Text Using Eagle2 Decoding
===================================

Source https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api/llm_eagle2_decoding.py.

.. literalinclude:: ../../../examples/llm-api/llm_eagle2_decoding.py
   :language: python
   :linenos:
@ -104,6 +104,7 @@ Welcome to TensorRT-LLM's Documentation!
advanced/inference-request.md
advanced/lora.md
advanced/expert-parallelism.md
advanced/kv-cache-management.md
advanced/kv-cache-reuse.md
advanced/speculative-decoding.md
advanced/disaggregated-service.md

@ -144,6 +145,7 @@ Welcome to TensorRT-LLM's Documentation!
blogs/quantization-in-TRT-LLM.md
blogs/XQA-kernel.md
blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md
blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md

Indices and tables

@ -131,3 +131,23 @@ API Reference
   :undoc-members:
   :special-members: __init__
   :show-inheritance:
.. autoclass:: tensorrt_llm.llmapi.NGramDecodingConfig
   :members:
   :undoc-members:
   :special-members: __init__
   :show-inheritance:
.. autoclass:: tensorrt_llm.llmapi.LlmArgs
   :members:
   :undoc-members:
   :special-members: __init__
   :show-inheritance:
.. autoclass:: tensorrt_llm.llmapi.TorchLlmArgs
   :members:
   :undoc-members:
   :special-members: __init__
   :show-inheritance:
.. autoclass:: tensorrt_llm.llmapi.TrtLlmArgs
   :members:
   :undoc-members:
   :special-members: __init__
   :show-inheritance:

@ -628,8 +628,7 @@ If you would like to force the KV cache quantization, you can specify the following
when the checkpoint precision is `null`:

```yaml
pytorch_backend_config:
  kv_cache_dtype: "fp8"
```

```{tip}
@ -200,12 +200,9 @@ trtllm-bench --model $model_name throughput --dataset $dataset_file --backend py

`llm_options.yml`
```yaml
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
```

@ -16,7 +16,7 @@ The following sections explain how to use these implementations and provide a br

There are currently three available attention backends: the vanilla backend, the TRT-LLM backend, and the Flashinfer backend.
You can specify the desired attention backend using `PyTorchConfig.attn_backend`. For instance, to utilize the Flashinfer backend, you can pass `attn_backend="flashinfer"` to the `LLM` constructor as follows: `LLM(attn_backend="flashinfer")`. This will enable the use of the Flashinfer backend for your model.

The vanilla backend, `VanillaAttention`, is a reference implementation designed primarily for inflight batching and linear KV cache support. While it serves as a useful baseline, it is not recommended for production use due to its limited optimizations.

@ -4,6 +4,8 @@ In Transformer-based models, the KV (Key-Value) Cache is a mechanism used to opt
Since KV Cache requires memory to store, it is also an important resource.
In TensorRT-LLM, KV Cache is managed by the `KVCacheManager`.

For details of the TensorRT-LLM `KVCacheManager` implementation see [KV Cache Management](../advanced/kv-cache-management.md).

## KV Cache Manager Introduction

`KVCacheManager` is a type of resource manager, inheriting from `BaseResourceManager`.
@ -51,7 +51,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -63,7 +63,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
<meta name="docsearch:version" content="0.21.0rc0" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -336,6 +336,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -357,6 +358,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -421,6 +423,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="graph-rewriting.html">Graph Rewriting Module</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1 current active"><a class="current reference internal" href="#">Disaggregated-Service (experimental)</a></li>
|
||||
@ -455,6 +458,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
@ -801,6 +805,15 @@ export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on June 03, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
@ -51,7 +51,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -63,7 +63,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
<meta name="docsearch:version" content="0.21.0rc0" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -336,6 +336,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -357,6 +358,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -421,6 +423,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="graph-rewriting.html">Graph Rewriting Module</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
@ -455,6 +458,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
@ -806,6 +810,15 @@ the TensorRT-LLM C++ Executor API.</p>
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on June 03, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
@ -51,19 +51,19 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
<link rel="icon" href="../_static/favicon.png"/>
|
||||
<link rel="index" title="Index" href="../genindex.html" />
|
||||
<link rel="search" title="Search" href="../search.html" />
|
||||
<link rel="next" title="KV cache reuse" href="kv-cache-reuse.html" />
|
||||
<link rel="next" title="KV Cache Management: Pools, Blocks, and Events" href="kv-cache-management.html" />
|
||||
<link rel="prev" title="Run gpt-2b + LoRA using Executor / cpp runtime" href="lora.html" />
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
<meta name="docsearch:version" content="0.21.0rc0" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -336,6 +336,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -357,6 +358,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -421,6 +423,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="graph-rewriting.html">Graph Rewriting Module</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
|
||||
<li class="toctree-l1 current active"><a class="current reference internal" href="#">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
@ -455,6 +458,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
@ -550,11 +554,11 @@
|
||||
</div>
|
||||
</a>
|
||||
<a class="right-next"
|
||||
href="kv-cache-reuse.html"
|
||||
href="kv-cache-management.html"
|
||||
title="next page">
|
||||
<div class="prev-next-info">
|
||||
<p class="prev-next-subtitle">next</p>
|
||||
<p class="prev-next-title">KV cache reuse</p>
|
||||
<p class="prev-next-title">KV Cache Management: Pools, Blocks, and Events</p>
|
||||
</div>
|
||||
<i class="fa-solid fa-angle-right"></i>
|
||||
</a>
|
||||
@ -673,6 +677,15 @@
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on June 03, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
@ -51,7 +51,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -63,7 +63,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
<meta name="docsearch:version" content="0.21.0rc0" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -336,6 +336,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -357,6 +358,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
|
||||
@ -421,6 +423,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="graph-rewriting.html">Graph Rewriting Module</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
@ -455,6 +458,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</nav></div>
|
||||
@ -991,6 +995,15 @@ is computed as:</p>
|
||||
</p>
|
||||
</div>
|
||||
|
||||
<div class="footer-item">
|
||||
<div class="extra_footer">
|
||||
|
||||
<p>Last updated on June 03, 2025.</p>
|
||||
|
||||
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
|
||||
|
||||
</div></div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
@ -51,7 +51,7 @@
|
||||
<script>
|
||||
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
|
||||
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
|
||||
DOCUMENTATION_OPTIONS.show_version_warning_banner =
|
||||
false;
|
||||
</script>
|
||||
@ -63,7 +63,7 @@
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1"/>
|
||||
<meta name="docsearch:language" content="en"/>
|
||||
<meta name="docsearch:version" content="0.20.0rc3" />
|
||||
<meta name="docsearch:version" content="0.21.0rc0" />
|
||||
|
||||
|
||||
</head>
|
||||
@ -336,6 +336,7 @@
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -357,6 +358,7 @@
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -421,6 +423,7 @@
<li class="toctree-l1"><a class="reference internal" href="graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -455,6 +458,7 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -1031,6 +1035,15 @@ The <code class="docutils literal notranslate"><span class="pre">GptDecoder</spa
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
@ -51,7 +51,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -63,7 +63,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
<meta name="docsearch:version" content="0.21.0rc0" />
</head>
@ -336,6 +336,7 @@
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -357,6 +358,7 @@
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@ -421,6 +423,7 @@
<li class="toctree-l1 current active"><a class="current reference internal" href="#">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@ -455,6 +458,7 @@
<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
</ul>
</div>
</nav></div>
@ -852,6 +856,15 @@ techniques to optimize the underlying graph. It provides a wrapper similar to P
</p>
</div>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
</div></div>
</div>
Some files were not shown because too many files have changed in this diff