Update latest GitHub pages to v0.21.0rc1

Kaiyu Xie 2025-06-11 02:46:37 +00:00
parent 1043c814df
commit 3db2a831c2
188 changed files with 3638 additions and 1165 deletions

@@ -1,4 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 5c850ce0a6f2d0ce79a91d25fbeeb241
config: 6d408ca198781361fe3feb19254966dc
tags: 645f666f9bcd5a90fca523b33c5a78b7

@@ -51,7 +51,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@@ -63,7 +63,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@@ -12925,9 +12925,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

@@ -51,7 +51,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@@ -63,7 +63,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@@ -9589,24 +9589,21 @@ one more than decoding draft tokens for prediction from primary head </p>
<div class="breathe-sectiondef docutils container">
<p class="breathe-sectiondef-title rubric" id="breathe-section-title-public-functions">Public Functions</p>
<dl class="cpp function">
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig">
<span id="_CPPv3N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig"></span><span id="_CPPv2N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig"></span><span id="tensorrt_llm::runtime::IGptDecoderBatched::setup__executor::DecodingModeCR.SizeType32.SizeType32.SizeType32.SizeType32.SizeType32.SizeType32.nvinfer1::DataType.ModelConfigCR.WorldConfigCR"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1IGptDecoderBatched_1a78998c48fc4fd319ea2696de8b0bfb1a"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">setup</span></span></span><span class="sig-paren">(</span>
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig">
<span id="_CPPv3N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig"></span><span id="_CPPv2N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig"></span><span id="tensorrt_llm::runtime::IGptDecoderBatched::setup__executor::DecodingModeCR.SizeType32.SizeType32.SizeType32.nvinfer1::DataType.ModelConfigCR.WorldConfigCR"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1IGptDecoderBatched_1a5ef9dff42e3a44389c190c14914b8458"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">setup</span></span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><a class="reference internal" href="executor.html#_CPPv4N12tensorrt_llm8executorE" title="tensorrt_llm::executor"><span class="n"><span class="pre">executor</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="executor.html#_CPPv4N12tensorrt_llm8executor12DecodingModeE" title="tensorrt_llm::executor::DecodingMode"><span class="n"><span class="pre">DecodingMode</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">mode</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">maxBatchSize</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">maxBeamWidth</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">maxAttentionWindow</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">sinkTokenLength</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">maxSequenceLength</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">maxTokensPerStep</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv48nvinfer1" title="nvinfer1"><span class="n"><span class="pre">nvinfer1</span></span></a><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">DataType</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">dtype</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime11ModelConfigE" title="tensorrt_llm::runtime::ModelConfig"><span class="n"><span class="pre">ModelConfig</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">modelConfig</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime11WorldConfigE" title="tensorrt_llm::runtime::WorldConfig"><span class="n"><span class="pre">WorldConfig</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">worldConfig</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig" title="Link to this definition">#</a><br /></dt>
<dd><p>Setup the decoder before calling <code class="docutils literal notranslate"><a class="reference internal" href="#classtensorrt__llm_1_1runtime_1_1IGptDecoderBatched_1a57f2c8ee8a7a6cdb36d93c40ff04d7cc"><span class="std std-ref"><span class="pre">forward()</span></span></a></code></p>
<span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig" title="Link to this definition">#</a><br /></dt>
<dd><p>Setup the decoder before calling <code class="docutils literal notranslate"><a class="reference internal" href="#classtensorrt__llm_1_1runtime_1_1IGptDecoderBatched_1ab71a988f92d801a763c8b7b960fd0769"><span class="std std-ref"><span class="pre">forward()</span></span></a></code></p>
</dd></dl>
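
The hunk above replaces the six-limit setup() overload of IGptDecoderBatched with a three-limit one (the signature IDs drop three of the SizeType32 parameters). A minimal call-site sketch follows; which three limits remain is an assumption (maxBatchSize, maxBeamWidth, maxSequenceLength), since the rendered diff only shows the mangled signature, and the header path is likewise assumed.

    // Sketch only: argument order mirrors the new signature in this hunk
    // (DecodingMode const&, three SizeType32 limits, nvinfer1::DataType,
    // ModelConfig const&, WorldConfig const&). The removed overload also took
    // maxAttentionWindow, sinkTokenLength and maxTokensPerStep.
    #include "tensorrt_llm/runtime/iGptDecoderBatched.h"   // header path assumed

    namespace tr = tensorrt_llm::runtime;

    void setupDecoder(tr::IGptDecoderBatched& decoder,
                      tensorrt_llm::executor::DecodingMode const& mode,
                      tr::SizeType32 maxBatchSize,
                      tr::SizeType32 maxBeamWidth,
                      tr::SizeType32 maxSequenceLength,   // assumed surviving limit
                      nvinfer1::DataType dtype,
                      tr::ModelConfig const& modelConfig,
                      tr::WorldConfig const& worldConfig)
    {
        decoder.setup(mode, maxBatchSize, maxBeamWidth, maxSequenceLength,
                      dtype, modelConfig, worldConfig);
    }
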
<dl class="cpp function">
@@ -9623,28 +9620,30 @@ one more than decoding draft tokens for prediction from primary head </p>
</dd></dl>
<dl class="cpp function">
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE">
<span id="_CPPv3N12tensorrt_llm7runtime18IGptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="_CPPv2N12tensorrt_llm7runtime18IGptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="tensorrt_llm::runtime::IGptDecoderBatched::forwardAsync__decoder_batch::OutputR.decoder_batch::InputCR"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1IGptDecoderBatched_1a90191ce0e50206fb5cb1f7fb3aa2acda"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime9CudaEventE" title="tensorrt_llm::runtime::CudaEvent"><span class="n"><span class="pre">CudaEvent</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">forwardAsync</span></span></span><span class="sig-paren">(</span>
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched12forwardAsyncERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE">
<span id="_CPPv3N12tensorrt_llm7runtime18IGptDecoderBatched12forwardAsyncERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="_CPPv2N12tensorrt_llm7runtime18IGptDecoderBatched12forwardAsyncERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="tensorrt_llm::runtime::IGptDecoderBatched::forwardAsync__decoder::DecoderStateCR.decoder_batch::OutputR.decoder_batch::InputCR"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1IGptDecoderBatched_1a654fbc257f26b53dadb65937899938c0"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime9CudaEventE" title="tensorrt_llm::runtime::CudaEvent"><span class="n"><span class="pre">CudaEvent</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">forwardAsync</span></span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoderE" title="tensorrt_llm::runtime::decoder"><span class="n"><span class="pre">decoder</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoder12DecoderStateE" title="tensorrt_llm::runtime::decoder::DecoderState"><span class="n"><span class="pre">DecoderState</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">decoderState</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batchE" title="tensorrt_llm::runtime::decoder_batch"><span class="n"><span class="pre">decoder_batch</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batch6OutputE" title="tensorrt_llm::runtime::decoder_batch::Output"><span class="n"><span class="pre">Output</span></span></a><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">output</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batchE" title="tensorrt_llm::runtime::decoder_batch"><span class="n"><span class="pre">decoder_batch</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batch5InputE" title="tensorrt_llm::runtime::decoder_batch::Input"><span class="n"><span class="pre">Input</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">input</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE" title="Link to this definition">#</a><br /></dt>
<span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched12forwardAsyncERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE" title="Link to this definition">#</a><br /></dt>
<dd><p>Run one step for all requests without blocking the host process and return the token for synchronization. </p>
</dd></dl>
<dl class="cpp function">
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE">
<span id="_CPPv3N12tensorrt_llm7runtime18IGptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="_CPPv2N12tensorrt_llm7runtime18IGptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="tensorrt_llm::runtime::IGptDecoderBatched::forward__decoder_batch::OutputR.decoder_batch::InputCR"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1IGptDecoderBatched_1a57f2c8ee8a7a6cdb36d93c40ff04d7cc"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">forward</span></span></span><span class="sig-paren">(</span>
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched7forwardERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE">
<span id="_CPPv3N12tensorrt_llm7runtime18IGptDecoderBatched7forwardERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="_CPPv2N12tensorrt_llm7runtime18IGptDecoderBatched7forwardERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="tensorrt_llm::runtime::IGptDecoderBatched::forward__decoder::DecoderStateCR.decoder_batch::OutputR.decoder_batch::InputCR"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1IGptDecoderBatched_1ab71a988f92d801a763c8b7b960fd0769"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">forward</span></span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoderE" title="tensorrt_llm::runtime::decoder"><span class="n"><span class="pre">decoder</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoder12DecoderStateE" title="tensorrt_llm::runtime::decoder::DecoderState"><span class="n"><span class="pre">DecoderState</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">decoderState</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batchE" title="tensorrt_llm::runtime::decoder_batch"><span class="n"><span class="pre">decoder_batch</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batch6OutputE" title="tensorrt_llm::runtime::decoder_batch::Output"><span class="n"><span class="pre">Output</span></span></a><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">output</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batchE" title="tensorrt_llm::runtime::decoder_batch"><span class="n"><span class="pre">decoder_batch</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batch5InputE" title="tensorrt_llm::runtime::decoder_batch::Input"><span class="n"><span class="pre">Input</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">input</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE" title="Link to this definition">#</a><br /></dt>
<span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched7forwardERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE" title="Link to this definition">#</a><br /></dt>
<dd><p>Run one step for all requests and wait for completion on the host. </p>
</dd></dl>
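
Both forwardAsync() and forward() in the hunks above gain a decoder::DecoderState parameter in front of the batch output and input. A minimal call-site sketch, assuming the state, output and input objects come from the surrounding runtime code and that the header path is as named; only the argument lists are taken from this page.

    #include "tensorrt_llm/runtime/iGptDecoderBatched.h"   // header path assumed

    namespace tr = tensorrt_llm::runtime;

    // Non-blocking step: the returned CudaEvent is the synchronization token
    // mentioned in the description above.
    tr::CudaEvent decodeStepAsync(tr::IGptDecoderBatched& decoder,
                                  tr::decoder::DecoderState const& decoderState,
                                  tr::decoder_batch::Output& output,
                                  tr::decoder_batch::Input const& input)
    {
        // Previously: decoder.forwardAsync(output, input);
        return decoder.forwardAsync(decoderState, output, input);
    }

    // Blocking variant with the same parameter change.
    void decodeStep(tr::IGptDecoderBatched& decoder,
                    tr::decoder::DecoderState const& decoderState,
                    tr::decoder_batch::Output& output,
                    tr::decoder_batch::Input const& input)
    {
        // Previously: decoder.forward(output, input);
        decoder.forward(decoderState, output, input);
    }
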
@@ -9992,46 +9991,30 @@ one more than decoding draft tokens for prediction from primary head </p>
<span id="_CPPv3N12tensorrt_llm7runtime17GptDecoderBatched9TensorPtrE"></span><span id="_CPPv2N12tensorrt_llm7runtime17GptDecoderBatched9TensorPtrE"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1a71918575432e49931d0452cfb4c98a8d"></span><span class="k"><span class="pre">using</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">TensorPtr</span></span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7ITensorE" title="tensorrt_llm::runtime::ITensor"><span class="n"><span class="pre">ITensor</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7ITensor9SharedPtrE" title="tensorrt_llm::runtime::ITensor::SharedPtr"><span class="n"><span class="pre">SharedPtr</span></span></a><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched9TensorPtrE" title="Link to this definition">#</a><br /></dt>
<dd></dd></dl>
<dl class="cpp type">
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14SharedConstPtrE">
<span id="_CPPv3N12tensorrt_llm7runtime17GptDecoderBatched14SharedConstPtrE"></span><span id="_CPPv2N12tensorrt_llm7runtime17GptDecoderBatched14SharedConstPtrE"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1a2dd6a0a3bdccf9535c9df769033efe2e"></span><span class="k"><span class="pre">using</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">SharedConstPtr</span></span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7ITensorE" title="tensorrt_llm::runtime::ITensor"><span class="n"><span class="pre">ITensor</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7ITensor14SharedConstPtrE" title="tensorrt_llm::runtime::ITensor::SharedConstPtr"><span class="n"><span class="pre">SharedConstPtr</span></span></a><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14SharedConstPtrE" title="Link to this definition">#</a><br /></dt>
<dd></dd></dl>
</div>
<div class="breathe-sectiondef docutils container">
<p class="breathe-sectiondef-title rubric" id="breathe-section-title-public-functions">Public Functions</p>
<dl class="cpp function">
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtrRK23SpeculativeDecodingModeN8nvinfer18DataTypeE">
<span id="_CPPv3N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtrRK23SpeculativeDecodingModeN8nvinfer18DataTypeE"></span><span id="_CPPv2N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtrRK23SpeculativeDecodingModeN8nvinfer18DataTypeE"></span><span id="tensorrt_llm::runtime::GptDecoderBatched::GptDecoderBatched__CudaStreamPtr.SpeculativeDecodingModeCR.nvinfer1::DataType"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1aca731fb55132a618ac1402370e6f55d6"></span><span class="sig-name descname"><span class="n"><span class="pre">GptDecoderBatched</span></span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13CudaStreamPtrE" title="tensorrt_llm::runtime::GptDecoderBatched::CudaStreamPtr"><span class="n"><span class="pre">CudaStreamPtr</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">stream</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingModeE" title="tensorrt_llm::runtime::SpeculativeDecodingMode"><span class="n"><span class="pre">SpeculativeDecodingMode</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">speculativeDecodingMode</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv48nvinfer1" title="nvinfer1"><span class="n"><span class="pre">nvinfer1</span></span></a><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">DataType</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">dtype</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtrRK23SpeculativeDecodingModeN8nvinfer18DataTypeE" title="Link to this definition">#</a><br /></dt>
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtr">
<span id="_CPPv3N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtr"></span><span id="_CPPv2N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtr"></span><span id="tensorrt_llm::runtime::GptDecoderBatched::GptDecoderBatched__CudaStreamPtr"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1a8629544d16a538ae9a46b0f23cccd7d3"></span><span class="k"><span class="pre">explicit</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">GptDecoderBatched</span></span></span><span class="sig-paren">(</span><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13CudaStreamPtrE" title="tensorrt_llm::runtime::GptDecoderBatched::CudaStreamPtr"><span class="n"><span class="pre">CudaStreamPtr</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">stream</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtr" title="Link to this definition">#</a><br /></dt>
<dd></dd></dl>
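
The hunk above swaps the three-argument GptDecoderBatched constructor for a single explicit CudaStreamPtr constructor. A construction sketch, assuming default CudaStream construction and the header paths named in the includes (neither is shown on this page):

    #include <memory>
    #include "tensorrt_llm/runtime/cudaStream.h"          // header path assumed
    #include "tensorrt_llm/runtime/gptDecoderBatched.h"   // header path assumed

    namespace tr = tensorrt_llm::runtime;

    std::unique_ptr<tr::GptDecoderBatched> makeDecoder()
    {
        auto stream = std::make_shared<tr::CudaStream>();   // stream construction assumed
        // Previously: GptDecoderBatched(stream, speculativeDecodingMode, dtype)
        // (the removed overload above); only the stream remains.
        return std::make_unique<tr::GptDecoderBatched>(stream);
    }
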
<dl class="cpp function">
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig">
<span id="_CPPv3N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig"></span><span id="_CPPv2N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig"></span><span id="tensorrt_llm::runtime::GptDecoderBatched::setup__executor::DecodingModeCR.SizeType32.SizeType32.SizeType32.SizeType32.SizeType32.SizeType32.nvinfer1::DataType.ModelConfigCR.WorldConfigCR"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1a4d3eccaa52123a48e1721d06f3e464a4"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">setup</span></span></span><span class="sig-paren">(</span>
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig">
<span id="_CPPv3N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig"></span><span id="_CPPv2N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig"></span><span id="tensorrt_llm::runtime::GptDecoderBatched::setup__executor::DecodingModeCR.SizeType32.SizeType32.SizeType32.nvinfer1::DataType.ModelConfigCR.WorldConfigCR"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1a8977d359344bba9f572e60c556b9a890"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">setup</span></span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><a class="reference internal" href="executor.html#_CPPv4N12tensorrt_llm8executorE" title="tensorrt_llm::executor"><span class="n"><span class="pre">executor</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="executor.html#_CPPv4N12tensorrt_llm8executor12DecodingModeE" title="tensorrt_llm::executor::DecodingMode"><span class="n"><span class="pre">DecodingMode</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">mode</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">maxBatchSize</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">maxBeamWidth</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">maxAttentionWindow</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">sinkTokenLength</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">maxSequenceLength</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">maxTokensPerStep</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv48nvinfer1" title="nvinfer1"><span class="n"><span class="pre">nvinfer1</span></span></a><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">DataType</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">dtype</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime11ModelConfigE" title="tensorrt_llm::runtime::ModelConfig"><span class="n"><span class="pre">ModelConfig</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">modelConfig</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime11WorldConfigE" title="tensorrt_llm::runtime::WorldConfig"><span class="n"><span class="pre">WorldConfig</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">worldConfig</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">override</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig" title="Link to this definition">#</a><br /></dt>
<dd><p>Setup the decoder before calling <code class="docutils literal notranslate"><a class="reference internal" href="#classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1a918a04e07f32ab5367f8b3e82697d7e5"><span class="std std-ref"><span class="pre">forward()</span></span></a></code></p>
<span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">override</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig" title="Link to this definition">#</a><br /></dt>
<dd><p>Setup the decoder before calling <code class="docutils literal notranslate"><a class="reference internal" href="#classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1a41740e026890310d78a3ac98c22e3132"><span class="std std-ref"><span class="pre">forward()</span></span></a></code></p>
</dd></dl>
<dl class="cpp function">
@@ -10048,28 +10031,30 @@ one more than decoding draft tokens for prediction from primary head </p>
</dd></dl>
<dl class="cpp function">
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE">
<span id="_CPPv3N12tensorrt_llm7runtime17GptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="_CPPv2N12tensorrt_llm7runtime17GptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="tensorrt_llm::runtime::GptDecoderBatched::forwardAsync__decoder_batch::OutputR.decoder_batch::InputCR"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1ad043b3f39e5b6efac043b9ed90ed1b7e"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime9CudaEventE" title="tensorrt_llm::runtime::CudaEvent"><span class="n"><span class="pre">CudaEvent</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">forwardAsync</span></span></span><span class="sig-paren">(</span>
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched12forwardAsyncERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE">
<span id="_CPPv3N12tensorrt_llm7runtime17GptDecoderBatched12forwardAsyncERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="_CPPv2N12tensorrt_llm7runtime17GptDecoderBatched12forwardAsyncERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="tensorrt_llm::runtime::GptDecoderBatched::forwardAsync__decoder::DecoderStateCR.decoder_batch::OutputR.decoder_batch::InputCR"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1a6f28f352026b3d0beb36947fb5706392"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime9CudaEventE" title="tensorrt_llm::runtime::CudaEvent"><span class="n"><span class="pre">CudaEvent</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">forwardAsync</span></span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoderE" title="tensorrt_llm::runtime::decoder"><span class="n"><span class="pre">decoder</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoder12DecoderStateE" title="tensorrt_llm::runtime::decoder::DecoderState"><span class="n"><span class="pre">DecoderState</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">decoderState</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batchE" title="tensorrt_llm::runtime::decoder_batch"><span class="n"><span class="pre">decoder_batch</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batch6OutputE" title="tensorrt_llm::runtime::decoder_batch::Output"><span class="n"><span class="pre">Output</span></span></a><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">output</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batchE" title="tensorrt_llm::runtime::decoder_batch"><span class="n"><span class="pre">decoder_batch</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batch5InputE" title="tensorrt_llm::runtime::decoder_batch::Input"><span class="n"><span class="pre">Input</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">input</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">override</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE" title="Link to this definition">#</a><br /></dt>
<span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">override</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched12forwardAsyncERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE" title="Link to this definition">#</a><br /></dt>
<dd><p>Run one step for all requests without blocking the host process and return the token for synchronization. </p>
</dd></dl>
<dl class="cpp function">
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE">
<span id="_CPPv3N12tensorrt_llm7runtime17GptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="_CPPv2N12tensorrt_llm7runtime17GptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="tensorrt_llm::runtime::GptDecoderBatched::forward__decoder_batch::OutputR.decoder_batch::InputCR"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1a918a04e07f32ab5367f8b3e82697d7e5"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">forward</span></span></span><span class="sig-paren">(</span>
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched7forwardERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE">
<span id="_CPPv3N12tensorrt_llm7runtime17GptDecoderBatched7forwardERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="_CPPv2N12tensorrt_llm7runtime17GptDecoderBatched7forwardERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="tensorrt_llm::runtime::GptDecoderBatched::forward__decoder::DecoderStateCR.decoder_batch::OutputR.decoder_batch::InputCR"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1a41740e026890310d78a3ac98c22e3132"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">forward</span></span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoderE" title="tensorrt_llm::runtime::decoder"><span class="n"><span class="pre">decoder</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoder12DecoderStateE" title="tensorrt_llm::runtime::decoder::DecoderState"><span class="n"><span class="pre">DecoderState</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">decoderState</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batchE" title="tensorrt_llm::runtime::decoder_batch"><span class="n"><span class="pre">decoder_batch</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batch6OutputE" title="tensorrt_llm::runtime::decoder_batch::Output"><span class="n"><span class="pre">Output</span></span></a><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">output</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batchE" title="tensorrt_llm::runtime::decoder_batch"><span class="n"><span class="pre">decoder_batch</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batch5InputE" title="tensorrt_llm::runtime::decoder_batch::Input"><span class="n"><span class="pre">Input</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">input</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">override</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE" title="Link to this definition">#</a><br /></dt>
<span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">override</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched7forwardERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE" title="Link to this definition">#</a><br /></dt>
<dd><p>Run one step for all requests and wait for completion on the host. </p>
</dd></dl>
@@ -10088,16 +10073,6 @@ one more than decoding draft tokens for prediction from primary head </p>
<dd><p>Gather final beam search results for request <code class="docutils literal notranslate"><span class="pre">batchSlot</span></code>. Result will only be available after event returned. </p>
</dd></dl>
<dl class="cpp function">
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15getDecoderStateEv">
<span id="_CPPv3N12tensorrt_llm7runtime17GptDecoderBatched15getDecoderStateEv"></span><span id="_CPPv2N12tensorrt_llm7runtime17GptDecoderBatched15getDecoderStateEv"></span><span id="tensorrt_llm::runtime::GptDecoderBatched::getDecoderState"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1a5b62ea90ff74aa90cee081ce7b40a2ac"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoderE" title="tensorrt_llm::runtime::decoder"><span class="n"><span class="pre">decoder</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoder12DecoderStateE" title="tensorrt_llm::runtime::decoder::DecoderState"><span class="n"><span class="pre">DecoderState</span></span></a><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="sig-name descname"><span class="n"><span class="pre">getDecoderState</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15getDecoderStateEv" title="Link to this definition">#</a><br /></dt>
<dd></dd></dl>
<dl class="cpp function">
<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched15getDecoderStateEv">
<span id="_CPPv3NK12tensorrt_llm7runtime17GptDecoderBatched15getDecoderStateEv"></span><span id="_CPPv2NK12tensorrt_llm7runtime17GptDecoderBatched15getDecoderStateEv"></span><span id="tensorrt_llm::runtime::GptDecoderBatched::getDecoderStateC"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1aca6ea8e433f9ce18b066b7d1b3d4fa7d"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoderE" title="tensorrt_llm::runtime::decoder"><span class="n"><span class="pre">decoder</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoder12DecoderStateE" title="tensorrt_llm::runtime::decoder::DecoderState"><span class="n"><span class="pre">DecoderState</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="sig-name descname"><span class="n"><span class="pre">getDecoderState</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched15getDecoderStateEv" title="Link to this definition">#</a><br /></dt>
<dd></dd></dl>
<dl class="cpp function">
<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched16getDecoderStreamEv">
<span id="_CPPv3NK12tensorrt_llm7runtime17GptDecoderBatched16getDecoderStreamEv"></span><span id="_CPPv2NK12tensorrt_llm7runtime17GptDecoderBatched16getDecoderStreamEv"></span><span id="tensorrt_llm::runtime::GptDecoderBatched::getDecoderStreamC"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1a00712a284f039faa4d900c53cceb7326"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13CudaStreamPtrE" title="tensorrt_llm::runtime::GptDecoderBatched::CudaStreamPtr"><span class="n"><span class="pre">CudaStreamPtr</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getDecoderStream</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched16getDecoderStreamEv" title="Link to this definition">#</a><br /></dt>
@@ -10125,44 +10100,19 @@ one more than decoding draft tokens for prediction from primary head </p>
<div class="breathe-sectiondef docutils container">
<p class="breathe-sectiondef-title rubric" id="breathe-section-title-private-functions">Private Functions</p>
<dl class="cpp function">
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched28setExplicitDraftTokensInputsERKN13decoder_batch5InputE">
<span id="_CPPv3N12tensorrt_llm7runtime17GptDecoderBatched28setExplicitDraftTokensInputsERKN13decoder_batch5InputE"></span><span id="_CPPv2N12tensorrt_llm7runtime17GptDecoderBatched28setExplicitDraftTokensInputsERKN13decoder_batch5InputE"></span><span id="tensorrt_llm::runtime::GptDecoderBatched::setExplicitDraftTokensInputs__decoder_batch::InputCR"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1ab20d2145fd96701343c46c4c7d5177a6"></span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">setExplicitDraftTokensInputs</span></span></span><span class="sig-paren">(</span><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batchE" title="tensorrt_llm::runtime::decoder_batch"><span class="n"><span class="pre">decoder_batch</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batch5InputE" title="tensorrt_llm::runtime::decoder_batch::Input"><span class="n"><span class="pre">Input</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">input</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched28setExplicitDraftTokensInputsERKN13decoder_batch5InputE" title="Link to this definition">#</a><br /></dt>
<dd><p>Sets inputs for explicit draft tokens. </p>
</dd></dl>
<dl class="cpp function">
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14setEagleInputsERKN13decoder_batch5InputE">
<span id="_CPPv3N12tensorrt_llm7runtime17GptDecoderBatched14setEagleInputsERKN13decoder_batch5InputE"></span><span id="_CPPv2N12tensorrt_llm7runtime17GptDecoderBatched14setEagleInputsERKN13decoder_batch5InputE"></span><span id="tensorrt_llm::runtime::GptDecoderBatched::setEagleInputs__decoder_batch::InputCR"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1a7e170cbad9f4ca2cd84eb3d828e81142"></span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">setEagleInputs</span></span></span><span class="sig-paren">(</span><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batchE" title="tensorrt_llm::runtime::decoder_batch"><span class="n"><span class="pre">decoder_batch</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batch5InputE" title="tensorrt_llm::runtime::decoder_batch::Input"><span class="n"><span class="pre">Input</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">input</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14setEagleInputsERKN13decoder_batch5InputE" title="Link to this definition">#</a><br /></dt>
<dd><p>Sets inputs for eagle decoding. </p>
</dd></dl>
<dl class="cpp function">
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15forwardDispatchERN13decoder_batch6OutputERKN13decoder_batch5InputE">
<span id="_CPPv3N12tensorrt_llm7runtime17GptDecoderBatched15forwardDispatchERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="_CPPv2N12tensorrt_llm7runtime17GptDecoderBatched15forwardDispatchERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="tensorrt_llm::runtime::GptDecoderBatched::forwardDispatch__decoder_batch::OutputR.decoder_batch::InputCR"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1accb62cadbd48438f893a0cb99e7a9124"></span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">forwardDispatch</span></span></span><span class="sig-paren">(</span>
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15forwardDispatchERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE">
<span id="_CPPv3N12tensorrt_llm7runtime17GptDecoderBatched15forwardDispatchERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="_CPPv2N12tensorrt_llm7runtime17GptDecoderBatched15forwardDispatchERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="tensorrt_llm::runtime::GptDecoderBatched::forwardDispatch__decoder::DecoderStateCR.decoder_batch::OutputR.decoder_batch::InputCR"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1a6afa6ebdff09dba1bd53d47aa74e2967"></span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">forwardDispatch</span></span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoderE" title="tensorrt_llm::runtime::decoder"><span class="n"><span class="pre">decoder</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoder12DecoderStateE" title="tensorrt_llm::runtime::decoder::DecoderState"><span class="n"><span class="pre">DecoderState</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">decoderState</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batchE" title="tensorrt_llm::runtime::decoder_batch"><span class="n"><span class="pre">decoder_batch</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batch6OutputE" title="tensorrt_llm::runtime::decoder_batch::Output"><span class="n"><span class="pre">Output</span></span></a><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">output</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batchE" title="tensorrt_llm::runtime::decoder_batch"><span class="n"><span class="pre">decoder_batch</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batch5InputE" title="tensorrt_llm::runtime::decoder_batch::Input"><span class="n"><span class="pre">Input</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">input</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15forwardDispatchERN13decoder_batch6OutputERKN13decoder_batch5InputE" title="Link to this definition">#</a><br /></dt>
<span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15forwardDispatchERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE" title="Link to this definition">#</a><br /></dt>
<dd><p>Calls decoders for tokens per engine step. </p>
</dd></dl>
<dl class="cpp function">
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14prepareForwardE10SizeType32RN13decoder_batch6OutputERKN13decoder_batch5InputE">
<span id="_CPPv3N12tensorrt_llm7runtime17GptDecoderBatched14prepareForwardE10SizeType32RN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="_CPPv2N12tensorrt_llm7runtime17GptDecoderBatched14prepareForwardE10SizeType32RN13decoder_batch6OutputERKN13decoder_batch5InputE"></span><span id="tensorrt_llm::runtime::GptDecoderBatched::prepareForward__SizeType32.decoder_batch::OutputR.decoder_batch::InputCR"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1a965f5f5bc9be5cda3e08674642360d19"></span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">prepareForward</span></span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">step</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batchE" title="tensorrt_llm::runtime::decoder_batch"><span class="n"><span class="pre">decoder_batch</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batch6OutputE" title="tensorrt_llm::runtime::decoder_batch::Output"><span class="n"><span class="pre">Output</span></span></a><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">output</span></span></em>,</dd>
<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batchE" title="tensorrt_llm::runtime::decoder_batch"><span class="n"><span class="pre">decoder_batch</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime13decoder_batch5InputE" title="tensorrt_llm::runtime::decoder_batch::Input"><span class="n"><span class="pre">Input</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">input</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14prepareForwardE10SizeType32RN13decoder_batch6OutputERKN13decoder_batch5InputE" title="Link to this definition">#</a><br /></dt>
<dd><p>Prepare Input and Output for decoder step. </p>
</dd></dl>
</div>
<div class="breathe-sectiondef docutils container">
<p class="breathe-sectiondef-title rubric" id="breathe-section-title-private-members">Private Members</p>
@ -10186,11 +10136,6 @@ one more than decoding draft tokens for prediction from primary head </p>
<span id="_CPPv3N12tensorrt_llm7runtime17GptDecoderBatched8mDecoderE"></span><span id="_CPPv2N12tensorrt_llm7runtime17GptDecoderBatched8mDecoderE"></span><span id="tensorrt_llm::runtime::GptDecoderBatched::mDecoder__GptDecoderPtr"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1acc180102b6c64b88146e253d4070e495"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13GptDecoderPtrE" title="tensorrt_llm::runtime::GptDecoderBatched::GptDecoderPtr"><span class="n"><span class="pre">GptDecoderPtr</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mDecoder</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched8mDecoderE" title="Link to this definition">#</a><br /></dt>
<dd></dd></dl>
<dl class="cpp var">
<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13mDecoderStateE">
<span id="_CPPv3N12tensorrt_llm7runtime17GptDecoderBatched13mDecoderStateE"></span><span id="_CPPv2N12tensorrt_llm7runtime17GptDecoderBatched13mDecoderStateE"></span><span id="tensorrt_llm::runtime::GptDecoderBatched::mDecoderState__std::shared_ptr:decoder::DecoderState:"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1GptDecoderBatched_1a693f55c08ae2ab8c333a0ba115df9f48"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">shared_ptr</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoderE" title="tensorrt_llm::runtime::decoder"><span class="n"><span class="pre">decoder</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoder12DecoderStateE" title="tensorrt_llm::runtime::decoder::DecoderState"><span class="n"><span class="pre">DecoderState</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mDecoderState</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13mDecoderStateE" title="Link to this definition">#</a><br /></dt>
<dd></dd></dl>
</div>
</dd></dl>
@ -13316,10 +13261,10 @@ one more than decoding draft tokens for prediction from primary head </p>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched13LlmRequestPtrE"><code class="docutils literal notranslate"><span class="pre">LlmRequestPtr</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched13RequestVectorE"><code class="docutils literal notranslate"><span class="pre">RequestVector</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched9TensorPtrE"><code class="docutils literal notranslate"><span class="pre">TensorPtr</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig"><code class="docutils literal notranslate"><span class="pre">setup()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig"><code class="docutils literal notranslate"><span class="pre">setup()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched16disableLookaheadERK13RequestVectorRK9TensorPtr"><code class="docutils literal notranslate"><span class="pre">disableLookahead()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE"><code class="docutils literal notranslate"><span class="pre">forwardAsync()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE"><code class="docutils literal notranslate"><span class="pre">forward()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched12forwardAsyncERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE"><code class="docutils literal notranslate"><span class="pre">forwardAsync()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched7forwardERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE"><code class="docutils literal notranslate"><span class="pre">forward()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime18IGptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb"><code class="docutils literal notranslate"><span class="pre">finalize()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched18IGptDecoderBatchedEv"><code class="docutils literal notranslate"><span class="pre">IGptDecoderBatched()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatchedD0Ev"><code class="docutils literal notranslate"><span class="pre">~IGptDecoderBatched()</span></code></a></li>
@ -13384,28 +13329,21 @@ one more than decoding draft tokens for prediction from primary head </p>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13LlmRequestPtrE"><code class="docutils literal notranslate"><span class="pre">LlmRequestPtr</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13RequestVectorE"><code class="docutils literal notranslate"><span class="pre">RequestVector</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched9TensorPtrE"><code class="docutils literal notranslate"><span class="pre">TensorPtr</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14SharedConstPtrE"><code class="docutils literal notranslate"><span class="pre">SharedConstPtr</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtrRK23SpeculativeDecodingModeN8nvinfer18DataTypeE"><code class="docutils literal notranslate"><span class="pre">GptDecoderBatched()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig"><code class="docutils literal notranslate"><span class="pre">setup()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtr"><code class="docutils literal notranslate"><span class="pre">GptDecoderBatched()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig"><code class="docutils literal notranslate"><span class="pre">setup()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched16disableLookaheadERK13RequestVectorRK9TensorPtr"><code class="docutils literal notranslate"><span class="pre">disableLookahead()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE"><code class="docutils literal notranslate"><span class="pre">forwardAsync()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE"><code class="docutils literal notranslate"><span class="pre">forward()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched12forwardAsyncERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE"><code class="docutils literal notranslate"><span class="pre">forwardAsync()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched7forwardERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE"><code class="docutils literal notranslate"><span class="pre">forward()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb"><code class="docutils literal notranslate"><span class="pre">finalize()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15getDecoderStateEv"><code class="docutils literal notranslate"><span class="pre">getDecoderState()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched15getDecoderStateEv"><code class="docutils literal notranslate"><span class="pre">getDecoderState()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched16getDecoderStreamEv"><code class="docutils literal notranslate"><span class="pre">getDecoderStream()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched20getUnderlyingDecoderEv"><code class="docutils literal notranslate"><span class="pre">getUnderlyingDecoder()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched16getBufferManagerEv"><code class="docutils literal notranslate"><span class="pre">getBufferManager()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13GptDecoderPtrE"><code class="docutils literal notranslate"><span class="pre">GptDecoderPtr</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched28setExplicitDraftTokensInputsERKN13decoder_batch5InputE"><code class="docutils literal notranslate"><span class="pre">setExplicitDraftTokensInputs()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14setEagleInputsERKN13decoder_batch5InputE"><code class="docutils literal notranslate"><span class="pre">setEagleInputs()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15forwardDispatchERN13decoder_batch6OutputERKN13decoder_batch5InputE"><code class="docutils literal notranslate"><span class="pre">forwardDispatch()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14prepareForwardE10SizeType32RN13decoder_batch6OutputERKN13decoder_batch5InputE"><code class="docutils literal notranslate"><span class="pre">prepareForward()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15forwardDispatchERKN7decoder12DecoderStateERN13decoder_batch6OutputERKN13decoder_batch5InputE"><code class="docutils literal notranslate"><span class="pre">forwardDispatch()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14mRuntimeStreamE"><code class="docutils literal notranslate"><span class="pre">mRuntimeStream</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14mDecoderStreamE"><code class="docutils literal notranslate"><span class="pre">mDecoderStream</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14mBufferManagerE"><code class="docutils literal notranslate"><span class="pre">mBufferManager</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched8mDecoderE"><code class="docutils literal notranslate"><span class="pre">mDecoder</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13mDecoderStateE"><code class="docutils literal notranslate"><span class="pre">mDecoderState</span></code></a></li>
</ul>
</li>
</ul>
@ -13770,9 +13708,9 @@ one more than decoding draft tokens for prediction from primary head </p>
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -1,11 +1,11 @@
import math
import weakref
from enum import IntEnum
from typing import Optional, Union, cast
import torch
from torch import nn
from tensorrt_llm.logger import logger
from tensorrt_llm.mapping import Mapping
from ..attention_backend import (AttentionInputType, AttentionMetadata,
@ -23,15 +23,6 @@ from .rms_norm import RMSNorm
from .rotary_embedding import RotaryEmbedding
class QkNormType(IntEnum):
"""
The type of QK normalization.
"""
none = 0 # No normalization applied to Q and K
pre_rope = 1 # Apply normalization before Rope
post_rope = 2 # Apply normalization after Rope
class Attention(nn.Module):
def __init__(
@ -43,7 +34,7 @@ class Attention(nn.Module):
max_position_embeddings: int,
bias: bool,
pos_embd_params: Optional[PositionalEmbeddingParams] = None,
qk_norm_type: QkNormType = QkNormType.none,
rope_fusion: Optional[bool] = None,
layer_idx: Optional[int] = None,
dtype: torch.dtype = None,
dense_bias: Optional[bool] = None,
@ -60,14 +51,14 @@ class Attention(nn.Module):
num_key_value_heads (int): The number of key value heads.
max_position_embeddings (int): The maximum position embeddings.
bias (bool): Whether to use bias in the linear layers.
pos_embd_params (PositionalEmbeddingParams): The positional embedding parameters.
qk_norm_type (QkNormType): The type of QK normalization.
layer_idx (int): The layer index.
pos_embd_params (Optional[PositionalEmbeddingParams]): The positional embedding parameters.
rope_fusion (Optional[bool]): Whether to fuse RoPE into the attention OP and skip applying unfused RoPE. If None, whether to fuse is decided by the capability of the attention backend.
layer_idx (Optional[int]): The layer index.
dtype (torch.dtype): The data type.
dense_bias (bool): Whether to use bias in the output projection layer.
config (ModelConfig): The model configuration.
dense_bias (Optional[bool]): Whether to use bias in the output projection layer.
config (Optional[ModelConfig]): The model configuration.
q_scaling (float): The scaling factor for the qk_scale. The definition is $O = softmax(QK^T * qk_scale) * V, qk_scale = 1 / (sqrt(head_dim) * q_scaling)$. The default value is 1.0.
attention_chunk_size (int): See [Chunked Attention] below.
attention_chunk_size (Optional[int]): See [Chunked Attention] below.
"""
super().__init__()
self.layer_idx = layer_idx
@ -81,7 +72,6 @@ class Attention(nn.Module):
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.max_position_embeddings = max_position_embeddings
self.pos_embd_params = pos_embd_params
self.qk_norm_type = qk_norm_type
self.dense_bias = dense_bias
self.q_scaling = q_scaling
@ -169,14 +159,21 @@ class Attention(nn.Module):
self.o_lora = LoraLayer([LoraModuleType.ATTENTION_DENSE],
[self.hidden_size])
# enable_rope_fusion: Whether to fuse RoPE into the attention OP.
# Whether to fuse RoPE into the attention OP.
# If true, RoPE will be applied in self.attn.forward.
# If false, RoPE will be applied in self.apply_rope.
self.enable_rope_fusion = attn_cls.support_fused_rope(
) and self.qk_norm_type != QkNormType.post_rope
self.rope_fusion = rope_fusion
if self.rope_fusion and not attn_cls.support_fused_rope():
logger.warning(
"rope_fusion is true but the attention backend does not support it. Will disable rope_fusion."
)
self.rope_fusion = False
# If rope_fusion is not specified, enable if the attention backend supports it.
if self.rope_fusion is None:
self.rope_fusion = attn_cls.support_fused_rope()
self.rotary_emb = None
if not self.enable_rope_fusion and self.pos_embd_params is not None:
if not self.rope_fusion and self.pos_embd_params is not None:
self.rotary_emb = RotaryEmbedding(
self.pos_embd_params.rope,
head_dim=self.head_dim,
@ -189,8 +186,7 @@ class Attention(nn.Module):
self.num_heads,
self.head_dim,
self.num_key_value_heads,
pos_embd_params=self.pos_embd_params
if self.enable_rope_fusion else None,
pos_embd_params=self.pos_embd_params if self.rope_fusion else None,
quant_config=self.quant_config,
skip_create_weights_in_init=config.skip_create_weights_in_init,
q_scaling=self.q_scaling,
@ -198,6 +194,7 @@ class Attention(nn.Module):
)
self.support_fused_qkv = self.attn.support_fused_qkv()
self.support_nvfp4_output = self.attn.support_nvfp4_output()
if not config.skip_create_weights_in_init:
self.create_weights()
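The hunks above replace the old enable_rope_fusion / QkNormType coupling with a single rope_fusion flag resolved in three steps: an explicit True is downgraded (with a warning) when the attention backend lacks fused RoPE, None falls back to the backend capability, and the unfused RotaryEmbedding path is only built when fusion ends up disabled. A minimal standalone sketch of that resolution logic, with a hypothetical _FakeBackend standing in for the real attention backend class (this is an illustration, not the library code):

    # Sketch only: mirrors the rope_fusion resolution shown in the diff above.
    from typing import Optional

    class _FakeBackend:
        @classmethod
        def support_fused_rope(cls) -> bool:
            return False  # e.g. a backend without fused-RoPE support

    def resolve_rope_fusion(requested: Optional[bool], attn_cls) -> bool:
        fusion = requested
        if fusion and not attn_cls.support_fused_rope():
            # the real code emits logger.warning before disabling fusion
            print("rope_fusion requested but unsupported by the backend; disabling.")
            fusion = False
        if fusion is None:  # not specified: follow the backend capability
            fusion = attn_cls.support_fused_rope()
        return fusion

    assert resolve_rope_fusion(None, _FakeBackend) is False   # auto-detected
    assert resolve_rope_fusion(True, _FakeBackend) is False   # downgraded with a warning

When fusion is enabled, pos_embd_params is forwarded to the attention op (so RoPE runs inside self.attn.forward); otherwise the module keeps its own RotaryEmbedding and applies RoPE in apply_rope.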
@ -222,7 +219,7 @@ class Attention(nn.Module):
def forward(
self,
position_ids: Optional[torch.LongTensor],
position_ids: Optional[torch.IntTensor],
hidden_states: Union[torch.Tensor, Fp4QuantizedTensor],
attn_metadata: AttentionMetadata,
attention_mask: PredefinedAttentionMask = PredefinedAttentionMask.
@ -237,7 +234,7 @@ class Attention(nn.Module):
Forward pass for the Attention module.
Args:
position_ids (Optional[torch.LongTensor]): The position IDs.
position_ids (Optional[torch.IntTensor]): The position IDs.
hidden_states (torch.Tensor): The hidden states.
attn_metadata (AttentionMetadata): The attention metadata.
attention_mask (PredefinedAttentionMask): The attention mask type.
@ -262,11 +259,16 @@ class Attention(nn.Module):
if qkv_lora is not None:
qkv = qkv + qkv_lora
q, k, v = self.apply_rope(qkv, position_ids)
q, k, v = qkv, None, None
q, k, v = self.apply_rope(q, k, v, position_ids)
out_scale = None
out_scale_sf = None
if self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4 or self.o_proj.has_fp8_block_scales:
out_scale = self.o_proj.inv_input_scale
if self.o_proj.has_nvfp4 and self.support_nvfp4_output:
out_scale_sf = self.o_proj.input_scale
q, k, v = self.convert_qkv(q, k, v)
attn_output = self.attn.forward(
@ -275,6 +277,7 @@ class Attention(nn.Module):
v,
attn_metadata,
out_scale=out_scale,
out_scale_sf=out_scale_sf,
attention_mask=attention_mask,
mrope_config=mrope_config,
attention_window_size=attention_window_size)
@ -285,32 +288,25 @@ class Attention(nn.Module):
layer_idx=self.layer_idx)
return attn_output
def apply_qk_norm(self, q, k):
raise NotImplementedError(
f"QK norm is not implemented for {self.__class__.__name__}."
"Please override the `apply_qk_norm` method in the subclass.")
def apply_rope(self, qkv: torch.Tensor, position_ids: torch.Tensor):
def apply_rope(self, q: torch.Tensor, k: Optional[torch.Tensor],
v: Optional[torch.Tensor], position_ids: torch.Tensor):
"""
Apply RoPE to the query and key, possibly including QK norm.
Apply RoPE to the query and key.
Depending on the implementation, q, k, v could be either fused (q, k, v = concat(q, k, v), None, None) or unfused (none of q, k, v is None).
Before self.attn.forward, convert_qkv will be called to make sure that the format of (q, k, v) satisfies the requirement of self.attn.
This method could be overridden in the subclass, in which extra functionalities such as q_norm/k_norm could be added.
Args:
qkv (torch.Tensor): The query, key, and value tensor.
q (torch.Tensor): The query tensor.
k (Optional[torch.Tensor]): The key tensor.
v (Optional[torch.Tensor]): The value tensor.
position_ids (torch.Tensor): The position IDs of each token for RoPE.
Returns:
tuple: A tuple of (q, k, v).
This method could be overridden in the subclass, it is possible that k/v is None and q is the concatenated qkv tensor, up to the implementation.
Before self.attn.forward, convert_qkv will be called to make sure that the format of (q, k, v) satisfies the requirement of self.attn.
"""
q, k, v = qkv, None, None
if self.qk_norm_type == QkNormType.pre_rope:
q, k, v = self.split_qkv(q, k, v)
q, k = self.apply_qk_norm(q, k)
if not self.enable_rope_fusion and position_ids is not None:
q, k, v = self.split_qkv(q, k, v)
q, k, v = self.split_qkv(q, k, v)
# If RoPE is fused into the attention OP, do not apply RoPE here.
if not self.rope_fusion and position_ids is not None:
q, k = self.rotary_emb(position_ids, [q, k])
if self.qk_norm_type == QkNormType.post_rope:
q, k = self.apply_qk_norm(q, k)
return q, k, v
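The refactored apply_rope(q, k, v, position_ids) is the extension point the docstring describes for per-model additions such as QK normalization, which previously went through QkNormType/apply_qk_norm. A toy sketch of that subclass pattern; ToyAttention, QKNormAttention and rms_norm are illustrative names rather than TensorRT-LLM classes, and the real method additionally handles fused qkv and the rope_fusion flag:

    # Illustrative subclass pattern for the new apply_rope hook (not library code).
    import torch

    def rms_norm(x: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)

    class ToyAttention:
        def apply_rope(self, q, k, v, position_ids):
            # base behaviour: rotate q/k as needed; returned unchanged in this sketch
            return q, k, v

    class QKNormAttention(ToyAttention):
        def apply_rope(self, q, k, v, position_ids):
            # extra functionality added by the subclass: q/k normalization before RoPE
            q, k = rms_norm(q), rms_norm(k)
            return super().apply_rope(q, k, v, position_ids)

    q = torch.randn(2, 4, 8); k = torch.randn(2, 4, 8); v = torch.randn(2, 4, 8)
    out = QKNormAttention().apply_rope(q, k, v, position_ids=None)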
@ -595,14 +591,14 @@ class MLA(nn.Module):
self.aux_stream = aux_stream
self.ln_events = [torch.cuda.Event(), torch.cuda.Event()]
self.enable_rope_fusion = self.mha.support_fused_rope()
self.rope_fusion = self.mha.support_fused_rope()
self.support_fused_qkv = self.mha.support_fused_qkv()
self.rotary_emb = RotaryEmbedding(
pos_embd_params.rope,
head_dim=self.qk_rope_head_dim,
is_neox=pos_embd_params.is_neox,
)
self.apply_rotary_emb = not self.enable_rope_fusion
self.apply_rotary_emb = not self.rope_fusion
if not config.skip_create_weights_in_init:
self.create_weights()
@ -687,7 +683,7 @@ class MLA(nn.Module):
Forward pass for the MLA module.
Args:
position_ids (Optional[torch.LongTensor]): The position IDs.
position_ids (Optional[torch.IntTensor]): The position IDs.
hidden_states (torch.Tensor): The hidden states.
attn_metadata (AttentionMetadata): The attention metadata.
all_reduce_params (Optional[AllReduceParams]): The all reduce parameters.
@ -841,7 +837,7 @@ class MLA(nn.Module):
compressed_kv: torch.Tensor,
k_pe: torch.Tensor,
attn_metadata: AttentionMetadata,
position_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.IntTensor] = None,
) -> torch.Tensor:
trtllm_attention = cast(TrtllmAttention, self.mha)
# split current q into q_nope and q_pe
@ -949,7 +945,7 @@ class MLA(nn.Module):
k_pe: torch.Tensor,
attn_metadata: AttentionMetadata,
latent_cache: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.IntTensor] = None,
) -> torch.Tensor:
if isinstance(self.mha, TrtllmAttention):
assert isinstance(attn_metadata, TrtllmAttentionMetadata)

View File

@ -42,7 +42,6 @@ from ..bindings.executor import (
PeftCacheConfig as _PeftCacheConfig,
SchedulerConfig as _SchedulerConfig) # isort: skip
# isort: on
from transformers import PreTrainedTokenizerBase
# yapf: enable
from ..builder import BuildConfig, EngineConfig
@ -1087,7 +1086,7 @@ class BaseLlmArgs(BaseModel):
self.speculative_model
) if self.speculative_model is not None else None
if model_obj.is_local_model and self.backend not in [
'pytorch', 'autodeploy'
'pytorch', '_autodeploy'
]:
# Load parallel_config from the engine.
self.model_format = get_model_format(self.model)
@ -1191,7 +1190,7 @@ class BaseLlmArgs(BaseModel):
self.build_config.max_draft_len = self.speculative_config.max_draft_len
if self.backend != 'pytorch':
if self.backend not in ['pytorch', '_autodeploy']:
eagle_config = _EagleConfig(
self.speculative_config.eagle_choices,
self.speculative_config.greedy_sampling,
@ -1211,7 +1210,7 @@ class BaseLlmArgs(BaseModel):
eagle3_one_model)
elif isinstance(self.speculative_config, NGramDecodingConfig):
self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.NGRAM
assert self.backend == 'pytorch'
assert self.backend in ['pytorch', '_autodeploy']
assert self.speculative_config.prompt_lookup_num_tokens > 0 and self.speculative_config.max_matching_ngram_size > 0
self.build_config.max_draft_len = self.speculative_config.max_draft_len
from tensorrt_llm._torch.speculative import NGramConfig
@ -1259,9 +1258,11 @@ class BaseLlmArgs(BaseModel):
"lora_dir is empty, so custom embedding or lm head will not be applied."
)
if self.enable_lora and self.lora_config is not None and self.backend == 'pytorch':
if self.enable_lora and self.lora_config is not None and self.backend in [
'pytorch', '_autodeploy'
]:
logger.warning(
"enable_lora is ignored when lora_config is provided for pytorch backend."
f"enable_lora is ignored when lora_config is provided for {self.backend} backend."
)
if self.lora_config is not None:
@ -1634,11 +1635,6 @@ class TorchLlmArgs(BaseLlmArgs):
def get_pytorch_backend_config(self) -> "PyTorchConfig":
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
# TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig
# Just a WAR to support the auto_deploy
if self.auto_deploy_config is not None:
return self.auto_deploy_config
return PyTorchConfig(
extra_resource_managers=self.extra_resource_managers,
use_cuda_graph=self.use_cuda_graph,
@ -1718,7 +1714,7 @@ class TorchLlmArgs(BaseLlmArgs):
2. If cuda_graph_batch_sizes is not provided, it is generated based on cuda_graph_max_batch_size
3. If both are provided, cuda_graph_batch_sizes must match the generated values
"""
if self.cuda_graph_batch_sizes is not None:
if self.cuda_graph_batch_sizes:
self.cuda_graph_batch_sizes = sorted(self.cuda_graph_batch_sizes)
if self.cuda_graph_max_batch_size != 0:
if self.cuda_graph_batch_sizes != self._generate_cuda_graph_batch_sizes(
@ -1743,6 +1739,109 @@ class TorchLlmArgs(BaseLlmArgs):
return self
class _AutoDeployLlmArgs(TorchLlmArgs):
"""LLM arguments specifically for AutoDeploy backend.
This class extends TorchLlmArgs with AutoDeploy-specific configuration options.
AutoDeploy provides automatic deployment and optimization of language models
with various attention backends and optimization strategies.
"""
model_factory: Literal[
"AutoModelForCausalLM", "AutoModelForImageTextToText"] = Field(
default="AutoModelForCausalLM",
description="The model factory to use for loading the model.",
)
model_kwargs: Dict[str, Any] = Field(
default_factory=dict,
description=
"Extra kwargs for the model config class to customize the model config. "
"These arguments take precedence over default values or config values in the model config "
"file. Arguments are resolved in order: 1) Default values in model config class, 2) Values "
"in model config file, 3) Values in model_kwargs. Note: if a kwarg doesn't exist in the "
"model config class, it will be ignored.",
)
mla_backend: Literal["MultiHeadLatentAttention"] = Field(
default="MultiHeadLatentAttention",
description="The Multi-Head Latent Attention backend to use.",
)
skip_loading_weights: bool = Field(
default=False,
description=
"Whether to skip loading model weights during initialization. "
"If True, only the model architecture is loaded.",
)
free_mem_ratio: float = Field(
default=0.8,
description="The fraction of available memory to allocate for cache. "
"Must be between 0.0 and 1.0.",
)
simple_shard_only: bool = Field(
default=False,
description=
"If True, force simple sharding (all_gather) in tensor parallelism. "
"If False, auto-detect and use column+row (all_reduce) sharding when possible.",
)
# TODO: Remove this field once tokens_per_block is properly passed through
attn_page_size: int = Field(
default=64,
description=
"Page size for attention (tokens_per_block). For TritonWithFlattenedInputs "
"backend, this should equal max_seq_len. Temporary field until tokens_per_block gets "
"properly passed through.",
)
@field_validator("free_mem_ratio")
@classmethod
def validate_free_mem_ratio(cls, v):
"""Validate that free_mem_ratio is between 0.0 and 1.0."""
if not 0.0 <= v <= 1.0:
raise ValueError(
f"free_mem_ratio must be between 0.0 and 1.0, got {v}")
return v
@print_traceback_on_error
def model_post_init(self, __context):
# Modify default values that differ from TorchLlmArgs
new_defaults = {
"max_batch_size": 8,
"max_seq_len": 512,
"attn_backend": "FlashInfer",
# TODO: Remove this when overlap scheduler is supported (https://github.com/NVIDIA/TensorRT-LLM/issues/4364)
"disable_overlap_scheduler": True,
}
for k, v_default in new_defaults.items():
if k not in self.__pydantic_fields_set__:
setattr(self, k, v_default)
# NOTE: Only call super() after setting the default values since default values should be
# set first.
super().model_post_init(__context)
# Handle attn_page_size for TritonWithFlattenedInputs backend
if self.attn_backend == "TritonWithFlattenedInputs":
self.attn_page_size = self.max_seq_len
# Add max_position_embeddings to model_kwargs
# TODO (lucaslie): this is more HF specific than a generic model_kwargs. Ideally, we can
# move this to the HF model factory but we don't have access to max_seq_len there right now.
self.model_kwargs["max_position_embeddings"] = min(
self.max_seq_len,
self.model_kwargs.get("max_position_embeddings", self.max_seq_len),
)
# TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig
def get_pytorch_backend_config(self) -> "_AutoDeployLlmArgs":
"""Return the _AutoDeployLlmArgs (self) object."""
return self
def update_llm_args_with_extra_dict(
llm_args: Dict,
llm_args_dict: Dict,
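The new _AutoDeployLlmArgs above leans on two Pydantic patterns: a field_validator that range-checks free_mem_ratio, and a model_post_init that overrides TorchLlmArgs defaults only for fields the caller did not set explicitly (via __pydantic_fields_set__). A self-contained sketch of both patterns under assumed names (SketchArgs is hypothetical, not the real class, and omits everything else the real arguments class does):

    # Sketch of the validator + selective-default-override pattern (not library code).
    from pydantic import BaseModel, Field, field_validator

    class SketchArgs(BaseModel):
        max_batch_size: int = 2048                 # parent-style default
        free_mem_ratio: float = Field(default=0.8)

        @field_validator("free_mem_ratio")
        @classmethod
        def _check_ratio(cls, v: float) -> float:
            if not 0.0 <= v <= 1.0:
                raise ValueError(f"free_mem_ratio must be between 0.0 and 1.0, got {v}")
            return v

        def model_post_init(self, __context) -> None:
            # AutoDeploy-style default wins only when the caller left the field untouched
            if "max_batch_size" not in self.__pydantic_fields_set__:
                setattr(self, "max_batch_size", 8)

    print(SketchArgs().max_batch_size)                   # 8  (override applied)
    print(SketchArgs(max_batch_size=64).max_batch_size)  # 64 (explicit value preserved)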

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -687,9 +687,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -1993,9 +1993,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -668,9 +668,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -721,10 +721,6 @@
<span class="k">else</span><span class="p">:</span>
<span class="n">output</span><span class="o">.</span><span class="n">token_ids</span><span class="o">.</span><span class="n">extend</span><span class="p">(</span><span class="n">response_tensors</span><span class="o">.</span><span class="n">output_token_ids</span><span class="p">[</span><span class="n">src_idx</span><span class="p">])</span>
<span class="c1"># In PD, the first token should be ignored in streaming mode, since it&#39;s already been returned by the context server</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">disaggregated_params</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">disaggregated_params</span><span class="o">.</span><span class="n">request_type</span> <span class="o">==</span> <span class="s2">&quot;generation_only&quot;</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">_streaming</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">decoding_iter</span> <span class="o">==</span> <span class="mi">2</span><span class="p">:</span>
<span class="n">output</span><span class="o">.</span><span class="n">_last_token_ids_len</span> <span class="o">=</span> <span class="mi">1</span>
<span class="k">if</span> <span class="n">response_tensors</span><span class="o">.</span><span class="n">cum_log_probs</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">output</span><span class="o">.</span><span class="n">cumulative_logprob</span> <span class="o">=</span> <span class="n">response_tensors</span><span class="o">.</span><span class="n">cum_log_probs</span><span class="p">[</span><span class="n">src_idx</span><span class="p">]</span>
@ -1273,9 +1269,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -519,7 +519,6 @@
<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm._utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">mpi_rank</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">Response</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.llmapi.utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">print_colored_debug</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.logger</span><span class="w"> </span><span class="kn">import</span> <span class="n">logger</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">..llmapi.mpi_session</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">MpiCommSession</span><span class="p">,</span> <span class="n">MpiPoolSession</span><span class="p">,</span> <span class="n">MpiSession</span><span class="p">,</span>
<span class="n">RemoteMpiCommSessionClient</span><span class="p">)</span>
@ -536,10 +535,6 @@
<span class="n">TLLM_EXECUTOR_PERIODICAL_RESP_IN_AWAIT</span> <span class="o">=</span> <span class="s2">&quot;TLLM_EXECUTOR_PERIODICAL_RESP_IN_AWAIT&quot;</span>
<span class="n">PERIODICAL_RESP_IN_AWAIT</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">getenv</span><span class="p">(</span>
<span class="n">LlmLauncherEnvs</span><span class="o">.</span><span class="n">TLLM_EXECUTOR_PERIODICAL_RESP_IN_AWAIT</span><span class="p">)</span> <span class="o">==</span> <span class="s2">&quot;1&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="nf">get_spawn_proxy_process_ipc_addr_env</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="nb">str</span> <span class="o">|</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&#39;&#39;&#39; Get the IPC address for the spawn proxy process dynamically. &#39;&#39;&#39;</span>
<span class="k">return</span> <span class="n">os</span><span class="o">.</span><span class="n">getenv</span><span class="p">(</span><span class="n">LlmLauncherEnvs</span><span class="o">.</span><span class="n">TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR</span><span class="p">)</span>
@ -556,10 +551,6 @@
<span class="k">return</span> <span class="n">os</span><span class="o">.</span><span class="n">getenv</span><span class="p">(</span><span class="n">LlmLauncherEnvs</span><span class="o">.</span><span class="n">TLLM_SPAWN_PROXY_PROCESS</span><span class="p">)</span> <span class="o">==</span> <span class="s2">&quot;1&quot;</span>
<span class="k">if</span> <span class="n">PERIODICAL_RESP_IN_AWAIT</span><span class="p">:</span>
<span class="n">logger</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s2">&quot;Using periodical responses in await_responses&quot;</span><span class="p">)</span>
<span class="k">def</span><span class="w"> </span><span class="nf">create_mpi_comm_session</span><span class="p">(</span>
<span class="n">n_workers</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">RemoteMpiCommSessionClient</span> <span class="o">|</span> <span class="n">MpiPoolSession</span><span class="p">:</span>
<span class="k">assert</span> <span class="n">mpi_rank</span><span class="p">(</span>
@ -658,7 +649,7 @@
<span class="k">class</span><span class="w"> </span><span class="nc">WorkerCommIpcAddrs</span><span class="p">(</span><span class="n">NamedTuple</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&#39;&#39;&#39; IPC addresses (str) and HMAC keys (bytes) for communication with the worker processes. &#39;&#39;&#39;</span>
<span class="n">request_queue_addr</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bytes</span><span class="p">]]</span>
<span class="n">request_error_queue_addr</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bytes</span><span class="p">]]</span>
<span class="n">worker_init_status_queue_addr</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bytes</span><span class="p">]]</span>
<span class="n">result_queue_addr</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bytes</span><span class="p">]]</span>
<span class="n">stats_queue_addr</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bytes</span><span class="p">]]</span>
<span class="n">kv_cache_events_queue_addr</span><span class="p">:</span> <span class="nb">tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bytes</span><span class="p">]]</span>
@ -781,9 +772,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -8681,9 +8681,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -646,9 +646,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -3511,9 +3511,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -653,9 +653,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -902,9 +902,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -1369,9 +1369,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@@ -511,7 +511,9 @@
import json
import os
import shutil
import socket
import tempfile
import time
import weakref
from pathlib import Path
from typing import Any, List, Literal, Optional, Sequence, Union
@@ -538,7 +540,7 @@
from ..logger import logger
from ..sampling_params import SamplingParams
from .llm_args import (LLMARGS_EXPLICIT_DOCSTRING, PybindMirror, TorchLlmArgs,
                        TrtLlmArgs)
                        TrtLlmArgs, _AutoDeployLlmArgs)
from .llm_utils import (CachedModelLoader, KvCacheRetentionConfig,
                        LlmBuildStats, ModelLoader, _ModelRuntimeContext)
from .mpi_session import MpiPoolSession, external_mpi_comm_available
@@ -601,6 +603,7 @@
    Attributes:
        tokenizer (tensorrt_llm.llmapi.tokenizer.TokenizerBase, optional): The tokenizer loaded by LLM instance, if any.
        workspace (pathlib.Path): The directory to store intermediate files.
        llm_id (str): The unique ID of the LLM instance.
    """
@@ -629,10 +632,16 @@
                 **kwargs: Any) -> None:
        self._executor_cls = kwargs.pop("executor_cls", GenerationExecutor)
        self._llm_id = None
        try:
            llm_args_cls = TorchLlmArgs if kwargs.get(
                'backend', None) == 'pytorch' else TrtLlmArgs
            backend = kwargs.get('backend', None)
            if backend == 'pytorch':
                llm_args_cls = TorchLlmArgs
            elif backend == '_autodeploy':
                llm_args_cls = _AutoDeployLlmArgs
            else:
                llm_args_cls = TrtLlmArgs
            self.args = llm_args_cls.from_kwargs(
                model=model,
@@ -706,6 +715,16 @@
    def workspace(self) -> Path:
        return Path(self._workspace.name) if self._on_trt_backend else None

    @property
    def llm_id(self) -> str:
        if self._llm_id is None:
            hostname = socket.gethostname()
            pid = os.getpid()
            timestamp = int(time.time() * 1000)
            self._llm_id = f"{hostname}-{pid}-{timestamp}"
        return self._llm_id

    def generate(
@@ -989,7 +1008,7 @@
        )
        sampling_params._setup(self.tokenizer)
        # auto enabled context and/or generation logits flags, as they are required by logprob computation for TRT backend.
        if self.args.backend not in ["pytorch", "autodeploy"]:
        if self.args.backend not in ["pytorch", "_autodeploy"]:
            if sampling_params.prompt_logprobs and not sampling_params.return_context_logits:
                sampling_params.return_context_logits = True
                sampling_params._context_logits_auto_enabled = True
@@ -1006,7 +1025,7 @@
    def _check_arguments(self, prompt_len: int, query_len: int,
                         sampling_params: SamplingParams) -> None:
        if self.args.backend == "pytorch":
        if self.args.backend in ["pytorch", "_autodeploy"]:
            # TODO: remove these checks after PyTorch backend
            # fully support TopK prompt and generation logprobs.
            if sampling_params.prompt_logprobs:
@@ -1018,7 +1037,7 @@
                    f"PyTorch backend currently only supports `logprobs=1`. Received `logprobs={sampling_params.logprobs}` (Top{sampling_params.logprobs} logprobs). Please set `logprobs=1` in `sampling_params` instead."
                )
            return
        elif self.args.backend == "autodeploy":
        elif self.args.backend == "_autodeploy":
            return
        build_config = self.args.build_config
@@ -1171,7 +1190,7 @@
            executor_config,
            backend=self.args.backend,
            pytorch_backend_config=self.args.get_pytorch_backend_config()
            if self.args.backend == "pytorch" else None,
            if self.args.backend in ["pytorch", "_autodeploy"] else None,
            mapping=self.args.parallel_config.to_mapping(),
            build_config=self.args.build_config
            if self._on_trt_backend else None,
@@ -1218,9 +1237,9 @@
        # TODO smor- need to refine what is the desired behavior if lora is enabled
        # in terms of the tokenizer initialization process
        if hasattr(
                self.args, "backend"
        ) and self.args.backend == "pytorch" and self.args.lora_config is not None:
        if hasattr(self.args, "backend") and self.args.backend in [
                "pytorch", "_autodeploy"
        ] and self.args.lora_config is not None:
            num_lora_dirs = len(self.args.lora_config.lora_dir)
            if num_lora_dirs == 1:
                tokenizer_path = self.args.lora_config.lora_dir[0]

View File

@@ -551,7 +551,6 @@
                                      PeftCacheConfig as _PeftCacheConfig,
                                      SchedulerConfig as _SchedulerConfig)  # isort: skip
# isort: on
from transformers import PreTrainedTokenizerBase
# yapf: enable
from ..builder import BuildConfig, EngineConfig
@@ -1668,7 +1667,7 @@
            self.speculative_model
        ) if self.speculative_model is not None else None
        if model_obj.is_local_model and self.backend not in [
                'pytorch', 'autodeploy'
                'pytorch', '_autodeploy'
        ]:
            # Load parallel_config from the engine.
            self.model_format = get_model_format(self.model)
@@ -1772,7 +1771,7 @@
            self.build_config.max_draft_len = self.speculative_config.max_draft_len
            if self.backend != 'pytorch':
            if self.backend not in ['pytorch', '_autodeploy']:
                eagle_config = _EagleConfig(
                    self.speculative_config.eagle_choices,
                    self.speculative_config.greedy_sampling,
@@ -1792,7 +1791,7 @@
                    eagle3_one_model)
        elif isinstance(self.speculative_config, NGramDecodingConfig):
            self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.NGRAM
            assert self.backend == 'pytorch'
            assert self.backend in ['pytorch', '_autodeploy']
            assert self.speculative_config.prompt_lookup_num_tokens > 0 and self.speculative_config.max_matching_ngram_size > 0
            self.build_config.max_draft_len = self.speculative_config.max_draft_len
            from tensorrt_llm._torch.speculative import NGramConfig
@@ -1840,9 +1839,11 @@
                "lora_dir is empty, so custom embedding or lm head will not be applied."
            )
        if self.enable_lora and self.lora_config is not None and self.backend == 'pytorch':
        if self.enable_lora and self.lora_config is not None and self.backend in [
                'pytorch', '_autodeploy'
        ]:
            logger.warning(
                "enable_lora is ignored when lora_config is provided for pytorch backend."
                f"enable_lora is ignored when lora_config is provided for {self.backend} backend."
            )
        if self.lora_config is not None:
@@ -2231,11 +2232,6 @@
    def get_pytorch_backend_config(self) -> "PyTorchConfig":
        from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
        # TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig
        # Just a WAR to support the auto_deploy
        if self.auto_deploy_config is not None:
            return self.auto_deploy_config
        return PyTorchConfig(
            extra_resource_managers=self.extra_resource_managers,
            use_cuda_graph=self.use_cuda_graph,
@@ -2321,7 +2317,7 @@
        2. If cuda_graph_batch_sizes is not provided, it is generated based on cuda_graph_max_batch_size
        3. If both are provided, cuda_graph_batch_sizes must match the generated values
        """
        if self.cuda_graph_batch_sizes is not None:
        if self.cuda_graph_batch_sizes:
            self.cuda_graph_batch_sizes = sorted(self.cuda_graph_batch_sizes)
            if self.cuda_graph_max_batch_size != 0:
                if self.cuda_graph_batch_sizes != self._generate_cuda_graph_batch_sizes(
@@ -2348,6 +2344,109 @@
class _AutoDeployLlmArgs(TorchLlmArgs):
    """LLM arguments specifically for AutoDeploy backend.

    This class extends TorchLlmArgs with AutoDeploy-specific configuration options.
    AutoDeploy provides automatic deployment and optimization of language models
    with various attention backends and optimization strategies.
    """

    model_factory: Literal[
        "AutoModelForCausalLM", "AutoModelForImageTextToText"] = Field(
            default="AutoModelForCausalLM",
            description="The model factory to use for loading the model.",
        )

    model_kwargs: Dict[str, Any] = Field(
        default_factory=dict,
        description=
        "Extra kwargs for the model config class to customize the model config. "
        "These arguments take precedence over default values or config values in the model config "
        "file. Arguments are resolved in order: 1) Default values in model config class, 2) Values "
        "in model config file, 3) Values in model_kwargs. Note: if a kwarg doesn't exist in the "
        "model config class, it will be ignored.",
    )

    mla_backend: Literal["MultiHeadLatentAttention"] = Field(
        default="MultiHeadLatentAttention",
        description="The Multi-Head Latent Attention backend to use.",
    )

    skip_loading_weights: bool = Field(
        default=False,
        description=
        "Whether to skip loading model weights during initialization. "
        "If True, only the model architecture is loaded.",
    )

    free_mem_ratio: float = Field(
        default=0.8,
        description="The fraction of available memory to allocate for cache. "
        "Must be between 0.0 and 1.0.",
    )

    simple_shard_only: bool = Field(
        default=False,
        description=
        "If True, force simple sharding (all_gather) in tensor parallelism. "
        "If False, auto-detect and use column+row (all_reduce) sharding when possible.",
    )

    # TODO: Remove this field once tokens_per_block is properly passed through
    attn_page_size: int = Field(
        default=64,
        description=
        "Page size for attention (tokens_per_block). For TritonWithFlattenedInputs "
        "backend, this should equal max_seq_len. Temporary field until tokens_per_block gets "
        "properly passed through.",
    )

    @field_validator("free_mem_ratio")
    @classmethod
    def validate_free_mem_ratio(cls, v):
        """Validate that free_mem_ratio is between 0.0 and 1.0."""
        if not 0.0 <= v <= 1.0:
            raise ValueError(
                f"free_mem_ratio must be between 0.0 and 1.0, got {v}")
        return v
<span class="nd">@print_traceback_on_error</span>
<span class="k">def</span><span class="w"> </span><span class="nf">model_post_init</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">__context</span><span class="p">):</span>
<span class="c1"># Modify default values that differ from TorchLlmArgs</span>
<span class="n">new_defaults</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">&quot;max_batch_size&quot;</span><span class="p">:</span> <span class="mi">8</span><span class="p">,</span>
<span class="s2">&quot;max_seq_len&quot;</span><span class="p">:</span> <span class="mi">512</span><span class="p">,</span>
<span class="s2">&quot;attn_backend&quot;</span><span class="p">:</span> <span class="s2">&quot;FlashInfer&quot;</span><span class="p">,</span>
<span class="c1"># TODO: Remove this when overlap scheduler is supported (https://github.com/NVIDIA/TensorRT-LLM/issues/4364)</span>
<span class="s2">&quot;disable_overlap_scheduler&quot;</span><span class="p">:</span> <span class="kc">True</span><span class="p">,</span>
<span class="p">}</span>
<span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v_default</span> <span class="ow">in</span> <span class="n">new_defaults</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
<span class="k">if</span> <span class="n">k</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">__pydantic_fields_set__</span><span class="p">:</span>
<span class="nb">setattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">k</span><span class="p">,</span> <span class="n">v_default</span><span class="p">)</span>
<span class="c1"># NOTE: Only call super() after setting the default values since default values should be</span>
<span class="c1"># set first.</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">model_post_init</span><span class="p">(</span><span class="n">__context</span><span class="p">)</span>
<span class="c1"># Handle attn_page_size for TritonWithFlattenedInputs backend</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">attn_backend</span> <span class="o">==</span> <span class="s2">&quot;TritonWithFlattenedInputs&quot;</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">attn_page_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_seq_len</span>
<span class="c1"># Add max_position_embeddings to model_kwargs</span>
<span class="c1"># TODO (lucaslie): this is more HF specific than a generic model_kwargs. Ideally, we can</span>
<span class="c1"># move this to the HF model factory but we don&#39;t have access to max_seq_len there right now.</span>
<span class="bp">self</span><span class="o">.</span><span class="n">model_kwargs</span><span class="p">[</span><span class="s2">&quot;max_position_embeddings&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">max_seq_len</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">model_kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;max_position_embeddings&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_seq_len</span><span class="p">),</span>
<span class="p">)</span>
<span class="c1"># TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig</span>
<span class="k">def</span><span class="w"> </span><span class="nf">get_pytorch_backend_config</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;_AutoDeployLlmArgs&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Return the _AutoDeployLlmArgs (self) object.&quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span>
<span class="k">def</span><span class="w"> </span><span class="nf">update_llm_args_with_extra_dict</span><span class="p">(</span>
<span class="n">llm_args</span><span class="p">:</span> <span class="n">Dict</span><span class="p">,</span>
<span class="n">llm_args_dict</span><span class="p">:</span> <span class="n">Dict</span><span class="p">,</span>

<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -2862,9 +2862,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -744,9 +744,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -906,9 +906,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -834,9 +834,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -1026,9 +1026,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -953,9 +953,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -1062,9 +1062,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -682,9 +682,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -834,9 +834,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -774,9 +774,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -908,9 +908,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -1256,9 +1256,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -1101,9 +1101,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -741,9 +741,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -891,9 +891,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -2202,9 +2202,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -1268,9 +1268,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -2663,9 +2663,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -806,9 +806,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -740,9 +740,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -808,9 +808,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -811,9 +811,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -855,9 +855,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -951,9 +951,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -1254,9 +1254,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -926,9 +926,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -1482,9 +1482,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -1034,9 +1034,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -1895,9 +1895,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -1165,9 +1165,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -5453,9 +5453,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -1112,9 +1112,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -1624,9 +1624,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -1828,9 +1828,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -1440,9 +1440,6 @@
<span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">model_type</span> <span class="o">==</span> <span class="s1">&#39;pixtral&#39;</span><span class="p">:</span>
<span class="c1"># Hold on to pixel_values and input_ids.</span>
<span class="n">dtype</span> <span class="o">=</span> <span class="n">str_dtype_to_torch</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vision_precision</span><span class="p">)</span>
<span class="n">pixel_values</span> <span class="o">=</span> <span class="n">image</span><span class="p">[</span><span class="s2">&quot;pixel_values&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="o">=</span><span class="s2">&quot;cuda&quot;</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">)</span>
<span class="n">input_ids</span> <span class="o">=</span> <span class="n">image</span><span class="p">[</span><span class="s2">&quot;input_ids&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">device</span><span class="o">=</span><span class="s2">&quot;cuda&quot;</span><span class="p">)</span>
<span class="c1"># Shape of pixel values from the processor varies with the raw image.</span>
<span class="c1"># So we create a new tensor with a fixed shape as expected by the vision</span>
<span class="c1"># encoder and create a corresponding attention mask.</span>
@ -1450,19 +1447,30 @@
<span class="n">patch_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">patch_size</span>
<span class="n">d_min</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">finfo</span><span class="p">(</span><span class="n">dtype</span><span class="p">)</span><span class="o">.</span><span class="n">min</span>
<span class="n">num_patches</span> <span class="o">=</span> <span class="p">(</span><span class="n">image_size</span> <span class="o">//</span> <span class="n">patch_size</span><span class="p">)</span>
<span class="n">image</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">full</span><span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="n">image_size</span><span class="p">,</span> <span class="n">image_size</span><span class="p">),</span>
<span class="n">fill_value</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span>
<span class="n">device</span><span class="o">=</span><span class="s2">&quot;cuda&quot;</span><span class="p">)</span>
<span class="n">attention_mask</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">full</span><span class="p">((</span><span class="mi">1</span><span class="p">,</span> <span class="n">num_patches</span><span class="p">,</span> <span class="n">num_patches</span><span class="p">),</span>
<span class="n">fill_value</span><span class="o">=</span><span class="n">d_min</span><span class="p">,</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span>
<span class="n">device</span><span class="o">=</span><span class="s2">&quot;cuda&quot;</span><span class="p">)</span>
<span class="n">h</span><span class="p">,</span> <span class="n">w</span> <span class="o">=</span> <span class="n">pixel_values</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="o">-</span><span class="mi">2</span><span class="p">:]</span>
<span class="n">image</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="p">:</span><span class="n">h</span><span class="p">,</span> <span class="p">:</span><span class="n">w</span><span class="p">]</span> <span class="o">=</span> <span class="n">pixel_values</span>
<span class="n">attention_mask</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="p">:</span><span class="n">h</span> <span class="o">//</span> <span class="n">patch_size</span><span class="p">,</span> <span class="p">:</span><span class="n">w</span> <span class="o">//</span> <span class="n">patch_size</span><span class="p">]</span> <span class="o">=</span> <span class="mi">0</span>
<span class="n">padded_image</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">full</span><span class="p">(</span>
<span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">batch_size</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="n">image_size</span><span class="p">,</span> <span class="n">image_size</span><span class="p">),</span>
<span class="n">fill_value</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span>
<span class="n">device</span><span class="o">=</span><span class="s2">&quot;cuda&quot;</span><span class="p">)</span>
<span class="n">padded_attention_mask</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">full</span><span class="p">(</span>
<span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">batch_size</span><span class="p">,</span> <span class="n">num_patches</span><span class="p">,</span> <span class="n">num_patches</span><span class="p">),</span>
<span class="n">fill_value</span><span class="o">=</span><span class="n">d_min</span><span class="p">,</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span>
<span class="n">device</span><span class="o">=</span><span class="s2">&quot;cuda&quot;</span><span class="p">)</span>
<span class="n">h</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">input_ids</span> <span class="o">=</span> <span class="p">[],</span> <span class="p">[],</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">img_idx</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">batch_size</span><span class="p">):</span>
<span class="n">pixel_values</span> <span class="o">=</span> <span class="n">image</span><span class="p">[</span><span class="s2">&quot;pixel_values&quot;</span><span class="p">][</span><span class="n">img_idx</span><span class="p">]</span>
<span class="n">img_h</span><span class="p">,</span> <span class="n">img_w</span> <span class="o">=</span> <span class="n">pixel_values</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="o">-</span><span class="mi">2</span><span class="p">:]</span>
<span class="n">padded_image</span><span class="p">[</span><span class="n">img_idx</span><span class="p">,</span> <span class="p">:,</span> <span class="p">:</span><span class="n">img_h</span><span class="p">,</span> <span class="p">:</span><span class="n">img_w</span><span class="p">]</span> <span class="o">=</span> <span class="n">pixel_values</span>
<span class="n">padded_attention_mask</span><span class="p">[</span><span class="n">img_idx</span><span class="p">,</span> <span class="p">:</span><span class="n">img_h</span> <span class="o">//</span> <span class="n">patch_size</span><span class="p">,</span> <span class="p">:</span><span class="n">img_w</span> <span class="o">//</span>
<span class="n">patch_size</span><span class="p">]</span> <span class="o">=</span> <span class="mi">0</span>
<span class="n">input_ids</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">image</span><span class="p">[</span><span class="s2">&quot;input_ids&quot;</span><span class="p">][</span><span class="n">img_idx</span><span class="p">])</span>
<span class="n">h</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">img_h</span><span class="p">)</span>
<span class="n">w</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">img_w</span><span class="p">)</span>
<span class="n">image</span> <span class="o">=</span> <span class="n">padded_image</span>
<span class="n">other_vision_inputs</span> <span class="o">=</span> <span class="p">{</span>
<span class="s2">&quot;attention_mask&quot;</span><span class="p">:</span> <span class="n">attention_mask</span><span class="p">,</span>
<span class="s2">&quot;attention_mask&quot;</span><span class="p">:</span> <span class="n">padded_attention_mask</span><span class="p">,</span>
<span class="p">}</span>
<span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">model_type</span> <span class="o">==</span> <span class="s1">&#39;llava_next&#39;</span><span class="p">:</span>
<span class="nb">input</span> <span class="o">=</span> <span class="n">image</span>
@ -1681,12 +1689,29 @@
<span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">model_type</span> <span class="o">==</span> <span class="s1">&#39;pixtral&#39;</span><span class="p">:</span>
<span class="n">relevant_patch_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">patch_size</span> <span class="o">*</span> <span class="bp">self</span><span class="o">.</span><span class="n">spatial_merge_size</span>
<span class="n">output_img_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">image_size</span> <span class="o">//</span> <span class="n">relevant_patch_size</span>
<span class="n">visual_features</span> <span class="o">=</span> <span class="n">visual_features</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span>
<span class="n">output_img_size</span><span class="p">,</span> <span class="n">output_img_size</span><span class="p">,</span>
<span class="o">-</span><span class="mi">1</span><span class="p">)[:</span><span class="n">h</span> <span class="o">//</span> <span class="n">relevant_patch_size</span><span class="p">,</span> <span class="p">:</span><span class="n">w</span> <span class="o">//</span>
<span class="n">relevant_patch_size</span><span class="p">]</span><span class="o">.</span><span class="n">flatten</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
<span class="c1"># Note: max_h * max_w shall serve as the `tokens_per_task` in ptuning prompt table.</span>
<span class="n">max_h</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="n">h</span><span class="p">)</span> <span class="o">//</span> <span class="n">relevant_patch_size</span>
<span class="n">max_w</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span><span class="n">w</span><span class="p">)</span> <span class="o">//</span> <span class="n">relevant_patch_size</span>
<span class="n">visual_embed_dim</span> <span class="o">=</span> <span class="n">visual_features</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span>
<span class="n">relevant_visual_features</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">batch_size</span><span class="p">,</span>
<span class="n">max_h</span> <span class="o">*</span> <span class="n">max_w</span><span class="p">,</span>
<span class="n">visual_embed_dim</span><span class="p">)</span>
<span class="k">for</span> <span class="n">img_idx</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">batch_size</span><span class="p">):</span>
<span class="n">complete_features</span> <span class="o">=</span> <span class="n">visual_features</span><span class="p">[</span><span class="n">img_idx</span><span class="p">]</span>
<span class="n">complete_features</span> <span class="o">=</span> <span class="n">complete_features</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span>
<span class="n">output_img_size</span><span class="p">,</span> <span class="n">output_img_size</span><span class="p">,</span> <span class="n">visual_embed_dim</span><span class="p">)</span>
<span class="n">relevant_h</span> <span class="o">=</span> <span class="n">h</span><span class="p">[</span><span class="n">img_idx</span><span class="p">]</span> <span class="o">//</span> <span class="n">relevant_patch_size</span>
<span class="n">relevant_w</span> <span class="o">=</span> <span class="n">w</span><span class="p">[</span><span class="n">img_idx</span><span class="p">]</span> <span class="o">//</span> <span class="n">relevant_patch_size</span>
<span class="n">flattened_features</span> <span class="o">=</span> <span class="n">complete_features</span><span class="p">[:</span><span class="n">relevant_h</span><span class="p">,</span> <span class="p">:</span>
<span class="n">relevant_w</span><span class="p">,</span> <span class="p">:]</span><span class="o">.</span><span class="n">flatten</span><span class="p">(</span>
<span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
<span class="n">relevant_visual_features</span><span class="p">[</span><span class="n">img_idx</span><span class="p">,</span> <span class="p">:</span><span class="n">relevant_h</span> <span class="o">*</span>
<span class="n">relevant_w</span><span class="p">,</span> <span class="p">:]</span> <span class="o">=</span> <span class="n">flattened_features</span>
<span class="n">visual_features</span> <span class="o">=</span> <span class="n">relevant_visual_features</span>
<span class="n">input_ids</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">ptuning_setup_pixtral</span><span class="p">(</span><span class="n">input_ids</span><span class="o">=</span><span class="n">input_ids</span><span class="p">)</span>
<span class="n">length</span> <span class="o">=</span> <span class="n">input_ids</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="c1"># Note: length is not used for pixtral model downstream. Setting it to a list</span>
<span class="c1"># of length of input_ids causes errors downstream. So, supplying a placeholder.</span>
<span class="n">length</span> <span class="o">=</span> <span class="n">input_ids</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">model_type</span> <span class="o">==</span> <span class="s1">&#39;llava_next&#39;</span><span class="p">:</span>
<span class="n">visual_features</span> <span class="o">=</span> <span class="n">LlavaNextUtils</span><span class="o">.</span><span class="n">rearrange_image_features</span><span class="p">(</span>
@ -2329,7 +2354,7 @@
<a class="viewcode-back" href="../../../python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.MultimodalModelRunner.get_rope_index">[docs]</a>
<span class="k">def</span><span class="w"> </span><span class="nf">get_rope_index</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">input_ids</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">LongTensor</span><span class="p">,</span>
<span class="n">input_ids</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">IntTensor</span><span class="p">,</span>
<span class="n">image_grid_thw</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">torch</span><span class="o">.</span><span class="n">LongTensor</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">video_grid_thw</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">torch</span><span class="o">.</span><span class="n">LongTensor</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">attention_mask</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
@ -2361,7 +2386,7 @@
<span class="sd"> Here we calculate the text start position_ids as the max vision position_ids plus 1.</span>
<span class="sd"> Args:</span>
<span class="sd"> input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):</span>
<span class="sd"> input_ids (`torch.IntTensor` of shape `(batch_size, sequence_length)`):</span>
<span class="sd"> Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide</span>
<span class="sd"> it.</span>
<span class="sd"> image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):</span>
@ -2375,7 +2400,7 @@
<span class="sd"> - 0 for tokens that are **masked**.</span>
<span class="sd"> Returns:</span>
<span class="sd"> position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)</span>
<span class="sd"> position_ids (`torch.IntTensor` of shape `(3, batch_size, sequence_length)`)</span>
<span class="sd"> mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">spatial_merge_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">spatial_merge_size</span>
@ -2594,16 +2619,19 @@
<a class="viewcode-back" href="../../../python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup_pixtral">[docs]</a>
<span class="k">def</span><span class="w"> </span><span class="nf">ptuning_setup_pixtral</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">input_ids</span><span class="p">):</span>
<span class="c1"># input_ids obtained from processor has token_ids for text as well as image tokens</span>
<span class="c1"># where each image token is represented the same image_token_index (10 for this model).</span>
<span class="c1"># where each image token is represented by the same image_token_index.</span>
<span class="n">image_token_index</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">image_token_index</span>
<span class="n">vocab_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">vocab_size</span>
<span class="c1"># Replace all image tokens with a unique token_id &gt; text_vacab_size.</span>
<span class="c1"># This shall be used to lookup the prompt table.</span>
<span class="n">replacer</span> <span class="o">=</span> <span class="n">vocab_size</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">input_ids</span><span class="p">[</span><span class="mi">0</span><span class="p">])):</span>
<span class="k">if</span> <span class="n">input_ids</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="n">i</span><span class="p">]</span> <span class="o">==</span> <span class="n">image_token_index</span><span class="p">:</span>
<span class="n">input_ids</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="n">i</span><span class="p">]</span> <span class="o">=</span> <span class="n">replacer</span>
<span class="n">replacer</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="k">for</span> <span class="n">img_idx</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">batch_size</span><span class="p">):</span>
<span class="c1"># Note: We reset replacer to text_vocab_size for each sample. This is as opposed to doing `replacer = vocab_size + img_idx * tokens_per_task`.</span>
<span class="c1"># That part of the look-up manipulation is done by the `task_ids` input to PromptEmbedding forward.</span>
<span class="n">replacer</span> <span class="o">=</span> <span class="n">vocab_size</span>
<span class="k">for</span> <span class="n">token_idx</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">input_ids</span><span class="p">[</span><span class="n">img_idx</span><span class="p">])):</span>
<span class="k">if</span> <span class="n">input_ids</span><span class="p">[</span><span class="n">img_idx</span><span class="p">][</span><span class="n">token_idx</span><span class="p">]</span> <span class="o">==</span> <span class="n">image_token_index</span><span class="p">:</span>
<span class="n">input_ids</span><span class="p">[</span><span class="n">img_idx</span><span class="p">][</span><span class="n">token_idx</span><span class="p">]</span> <span class="o">=</span> <span class="n">replacer</span>
<span class="n">replacer</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="k">return</span> <span class="n">input_ids</span></div>
@ -2745,7 +2773,24 @@
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">image_path</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">image_path</span> <span class="o">=</span> <span class="n">image_path</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">path_sep</span><span class="p">)</span>
<span class="n">images</span> <span class="o">=</span> <span class="n">load_images</span><span class="p">(</span><span class="n">image_path</span><span class="p">)</span>
<span class="k">elif</span> <span class="s2">&quot;pixtral&quot;</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">model_type</span><span class="p">:</span>
<span class="k">if</span> <span class="n">image_path</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">image_urls</span> <span class="o">=</span> <span class="p">[</span>
<span class="s2">&quot;https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png&quot;</span><span class="p">,</span>
<span class="s2">&quot;https://www.ilankelman.org/stopsigns/australia.jpg&quot;</span><span class="p">,</span>
<span class="s2">&quot;https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.png&quot;</span><span class="p">,</span>
<span class="s2">&quot;https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg&quot;</span><span class="p">,</span>
<span class="p">]</span>
<span class="k">while</span> <span class="nb">len</span><span class="p">(</span><span class="n">image_urls</span><span class="p">)</span> <span class="o">&lt;</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">batch_size</span><span class="p">:</span>
<span class="n">image_urls</span> <span class="o">*=</span> <span class="mi">2</span>
<span class="n">image_urls</span> <span class="o">=</span> <span class="n">image_urls</span><span class="p">[:</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">batch_size</span><span class="p">]</span>
<span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">image_path</span> <span class="o">=</span> <span class="s2">&quot;,&quot;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">image_urls</span><span class="p">)</span>
<span class="n">images</span> <span class="o">=</span> <span class="n">load_images</span><span class="p">(</span><span class="n">image_urls</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">image_path</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">image_path</span> <span class="o">=</span> <span class="n">image_path</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">path_sep</span><span class="p">)</span>
<span class="n">images</span> <span class="o">=</span> <span class="n">load_images</span><span class="p">(</span><span class="n">image_path</span><span class="p">)</span>
<span class="n">images</span> <span class="o">=</span> <span class="p">[</span><span class="n">images</span><span class="p">]</span> <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">images</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> <span class="k">else</span> <span class="n">images</span>
<span class="k">elif</span> <span class="s2">&quot;nougat&quot;</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">model_type</span><span class="p">:</span>
<span class="n">filepath</span> <span class="o">=</span> <span class="n">hf_hub_download</span><span class="p">(</span>
<span class="n">repo_id</span><span class="o">=</span><span class="s2">&quot;hf-internal-testing/fixtures_docvqa&quot;</span><span class="p">,</span>
@ -2998,9 +3043,15 @@
<span class="n">post_prompt</span> <span class="o">=</span> <span class="s2">&quot;[/INST]&quot;</span>
<span class="n">prompt</span> <span class="o">=</span> <span class="n">pre_prompt</span> <span class="o">+</span> <span class="n">input_text</span> <span class="o">+</span> <span class="n">post_prompt</span>
<span class="n">dtype</span> <span class="o">=</span> <span class="n">str_dtype_to_torch</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">vision_precision</span><span class="p">)</span>
<span class="n">image</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">processor</span><span class="p">(</span><span class="n">text</span><span class="o">=</span><span class="n">prompt</span><span class="p">,</span>
<span class="n">images</span><span class="o">=</span><span class="p">[</span><span class="n">raw_image</span><span class="p">],</span>
<span class="n">return_tensors</span><span class="o">=</span><span class="s2">&quot;pt&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">dtype</span><span class="p">)</span>
<span class="n">image</span> <span class="o">=</span> <span class="p">{</span><span class="s1">&#39;pixel_values&#39;</span><span class="p">:</span> <span class="p">[],</span> <span class="s1">&#39;input_ids&#39;</span><span class="p">:</span> <span class="p">[]}</span>
<span class="k">for</span> <span class="n">img_idx</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">batch_size</span><span class="p">):</span>
<span class="n">image_info</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">processor</span><span class="p">(</span><span class="n">text</span><span class="o">=</span><span class="n">prompt</span><span class="p">,</span>
<span class="n">images</span><span class="o">=</span><span class="p">[</span><span class="n">raw_image</span><span class="p">[</span><span class="n">img_idx</span><span class="p">]],</span>
<span class="n">return_tensors</span><span class="o">=</span><span class="s2">&quot;pt&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">dtype</span><span class="p">)</span>
<span class="n">image</span><span class="p">[</span><span class="s1">&#39;pixel_values&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">image_info</span><span class="p">[</span><span class="s1">&#39;pixel_values&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">to</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">device</span><span class="p">))</span>
<span class="n">image</span><span class="p">[</span><span class="s1">&#39;input_ids&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">image_info</span><span class="p">[</span><span class="s1">&#39;input_ids&#39;</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">to</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">device</span><span class="p">))</span>
<span class="k">elif</span> <span class="s1">&#39;internvl&#39;</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">model_type</span><span class="p">:</span>
<span class="n">pre_prompt</span> <span class="o">=</span> <span class="s2">&quot;&lt;|system|&gt;</span><span class="se">\n</span><span class="s2">你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型英文名叫InternVL, 是一个有用无害的人工智能助手。&lt;|end|&gt;&lt;|user|&gt;</span><span class="se">\n</span><span class="s2">&lt;image&gt;</span><span class="se">\n</span><span class="s2">&quot;</span>
@ -3204,7 +3255,9 @@
<span class="n">image</span> <span class="o">=</span> <span class="n">image</span><span class="o">.</span><span class="n">expand</span><span class="p">(</span>
<span class="nb">min</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">batch_size</span><span class="p">,</span> <span class="nb">len</span><span class="p">(</span><span class="n">input_text</span><span class="p">)),</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span>
<span class="o">-</span><span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">contiguous</span><span class="p">()</span>
<span class="k">if</span> <span class="n">image</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="c1"># Note: For pixtral model, image is a dict with each value being a list of tensors.</span>
<span class="c1"># Moving to device is handled above. So, it&#39;s safe to skip this for pixtral.</span>
<span class="k">if</span> <span class="n">image</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="s1">&#39;pixtral&#39;</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">model_type</span><span class="p">:</span>
<span class="n">image</span> <span class="o">=</span> <span class="n">image</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">device</span><span class="p">)</span>
<span class="c1"># Generate decoder_input_ids for enc-dec models</span>
<span class="c1"># Custom prompts can be added as:</span>
@ -3354,9 +3407,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -972,9 +972,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -50,7 +50,7 @@
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc1';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
@ -60,7 +60,7 @@
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.21.0rc0" />
<meta name="docsearch:version" content="0.21.0rc1" />
</head>
@ -1106,9 +1106,9 @@
<div class="footer-item">
<div class="extra_footer">
<p>Last updated on June 03, 2025.</p>
<p>Last updated on June 09, 2025.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9c012d5">9c012d5</a>.</p>
</div></div>

View File

@ -4,24 +4,24 @@
TensorRT-LLM has a Model Definition API that can be used to define
Large Language Models. This API is built on top of the powerful
[TensorRT Python API](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/index.html#)
[TensorRT Python API](https://docs.nvidia.com/deeplearning/tensorrt/latest/_static/python-api/index.html)
to create graph representations of deep neural networks in TensorRT. To become
familiar with the core concepts of the TensorRT API, refer to the
[Core Concepts](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/coreConcepts.html)
[Core Concepts](https://docs.nvidia.com/deeplearning/tensorrt/latest/_static/python-api/coreConcepts.html)
section of the TensorRT documentation before proceeding further.
In TensorRT-LLM, the [`tensorrt_llm.Builder`](source:tensorrt_llm/builder.py) class
contains a
[`tensorrt.Builder`](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Core/Builder.html#tensorrt.Builder)
[`tensorrt.Builder`](https://docs.nvidia.com/deeplearning/tensorrt/latest/_static/python-api/infer/Core/Builder.html#id1)
object. That instance is used in the `tensorrt_llm.Builder.create_network`
method to create an instance of the
[`tensorrt.INetworkDefinition`](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Graph/Network.html#tensorrt.INetworkDefinition)
[`tensorrt.INetworkDefinition`](https://docs.nvidia.com/deeplearning/tensorrt/latest/_static/python-api/infer/Graph/Network.html#tensorrt.INetworkDefinition)
class. The `INetworkDefinition` object can then be populated using the free
functions defined in the
[`tensorrt_llm.functional`](source:tensorrt_llm/functional.py).
A simple example of such a free function is `tensorrt_llm.activation` that inserts a
[`tensorrt.IActivationLayer`](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Graph/Layers.html#tensorrt.IActivationLayer)
[`tensorrt.IActivationLayer`](https://docs.nvidia.com/deeplearning/tensorrt/latest/_static/python-api/infer/Graph/Layers.html#tensorrt.IActivationLayer)
node in the graph of the model:
```python
@ -56,23 +56,23 @@ def silu(input: Tensor) -> Tensor:
When the TensorRT-LLM's Model Definition API is utilized, a graph of the network is
assembled. The graph can later be traversed or transformed using the graph
traversal API exposed by the
[`tensorrt.ILayer`](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Graph/LayerBase.html#tensorrt.ILayer)
[`tensorrt.ILayer`](https://docs.nvidia.com/deeplearning/tensorrt/latest/_static/python-api/infer/Graph/LayerBase.html#tensorrt.ILayer)
class. That graph will also be optimized by TensorRT during the compilation of
the engine, as explained in the next section.
# Compilation
Once populated, the instance of the
[`tensorrt.INetworkDefinition`](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Graph/Network.html#tensorrt.INetworkDefinition),
[`tensorrt.INetworkDefinition`](https://docs.nvidia.com/deeplearning/tensorrt/latest/_static/python-api/infer/Graph/Network.html#tensorrt.INetworkDefinition),
can be compiled into an efficient engine by the
[`tensorrt.Builder`](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Core/Builder.html#tensorrt.Builder)
[`tensorrt.Builder`](https://docs.nvidia.com/deeplearning/tensorrt/latest/_static/python-api/infer/Core/Builder.html#id1)
In TensorRT-LLM, it is done through the `build_engine` member function of the
`tensorrt_llm.Builder` class that calls the
[`build_serialized_network`](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Core/Builder.html#tensorrt.Builder.build_serialized_network)
[`build_serialized_network`](https://docs.nvidia.com/deeplearning/tensorrt/latest/_static/python-api/infer/Core/Builder.html#tensorrt.Builder.build_serialized_network)
method of the
[`tensorrt.Builder`](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Core/Builder.html#tensorrt.Builder)
[`tensorrt.Builder`](https://docs.nvidia.com/deeplearning/tensorrt/latest/_static/python-api/infer/Core/Builder.html#id1)
object. That call, if everything works as expected, produces an instance of the
[`tensorrt.IHostMemory`](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/FoundationalTypes/HostMemory.html#tensorrt.IHostMemory)
[`tensorrt.IHostMemory`](https://docs.nvidia.com/deeplearning/tensorrt/latest/_static/python-api/infer/FoundationalTypes/HostMemory.html#tensorrt.IHostMemory)
class. That object is an optimized TensorRT engine that can be stored as a
binary file.
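
For orientation, the snippet below is a minimal, hedged sketch of this build flow. The exact builder-config options and the way the serialized engine is written out are assumptions here and may differ between TensorRT-LLM versions.

```python
# Minimal sketch of the compilation flow described above (assumed API surface;
# adapt to your TensorRT-LLM version). The Builder wraps a tensorrt.Builder and
# serializes the populated INetworkDefinition into an engine.
import tensorrt_llm

builder = tensorrt_llm.Builder()                        # wraps tensorrt.Builder
builder_config = builder.create_builder_config(
    name="example", precision="float16")                # options are illustrative
network = builder.create_network()                      # wraps INetworkDefinition

with tensorrt_llm.net_guard(network):
    # Populate the graph here with tensorrt_llm.functional free functions,
    # for example tensorrt_llm.functional.activation / silu shown earlier.
    pass

engine = builder.build_engine(network, builder_config)  # serialized engine
with open("model.engine", "wb") as f:                   # store as a binary file
    f.write(engine)
```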

View File

@ -4,7 +4,7 @@
# H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token
TensorRT-LLM evaluated on both Hopper and Ampere shows **H100 FP8 is up to 4.6x max throughput and 4.4x faster 1st token latency than A100**. H100 FP8 is able to achieve over 10,000 output tok/s at [peak throughput](https://nvidia.github.io/TensorRT-LLM/performance.html#h100-gpus-fp8) for 64 concurrent requests, while maintaining a 1st token latency of 100ms. For [min-latency](https://nvidia.github.io/TensorRT-LLM/performance.html#id1) applications, TRT-LLM H100 can achieve less than 10ms to 1st token latency.
TensorRT-LLM evaluated on both Hopper and Ampere shows **H100 FP8 is up to 4.6x max throughput and 4.4x faster 1st token latency than A100**. H100 FP8 is able to achieve over 10,000 output tok/s at peak throughput for 64 concurrent requests, while maintaining a 1st token latency of 100ms. For min-latency applications, TRT-LLM H100 can achieve less than 10ms to 1st token latency.
<img src="https://github.com/NVIDIA/TensorRT-LLM/blob/rel/docs/source/blogs/media/TRT_LLM_v0-5-0_H100vA100_tps.png?raw=true" alt="max throughput" width="500" height="auto">
@ -28,7 +28,7 @@ TensorRT-LLM evaluated on both Hopper and Ampere shows **H100 FP8 is up to 4.6x
<sub>FP8 H100, FP16 A100, SXM 80GB GPUs, TP1, ISL/OSL's provided, TensorRT-LLM v0.5.0., TensorRT 9.1</sub>
The full data behind these charts & tables and including larger models with higher TP values can be found in TensorRT-LLM's [Performance Documentation](https://nvidia.github.io/TensorRT-LLM/performance.html#performance-of-tensorrt-llm)
The full data behind these charts & tables and including larger models with higher TP values can be found in TensorRT-LLM's [Performance Documentation](https://nvidia.github.io/TensorRT-LLM/latest/performance/perf-overview.html)
Stay tuned for a highlight on Llama coming soon!

View File

@ -21,7 +21,7 @@ TensorRT-LLM evaluation of the [new H200 GPU](https://nvidianews.nvidia.com/news
<sup>*(1) Largest batch supported on given TP configuration by power of 2.*</sup> <sup>*(2) TP = Tensor Parallelism*</sup>
Additional Performance data is available on the [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference/ai-inference) page, & soon in [TensorRT-LLM's Performance Documentation](https://nvidia.github.io/TensorRT-LLM/performance.html).
Additional Performance data is available on the [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference/ai-inference) page, & soon in [TensorRT-LLM's Performance Documentation](https://nvidia.github.io/TensorRT-LLM/latest/performance/perf-overview.html).
### H200 vs H100

View File

@ -2,37 +2,39 @@
by NVIDIA TensorRT-LLM team
## Table of Contents
- [Background](#background)
- [Implementation Configuration](#implementation-configuration)
- [Workload Profile](#workload-profile)
- [Model Architecture](#model-architecture)
- [Precision Strategy](#precision-strategy)
- [Parallelism Strategy](#parallelism-strategy)
- [Everything in One Diagram](#everything-in-one-diagram)
- [Key Optimizations](#key-optimizations)
- [System Level optimizations](#system-level-optimizations)
- [CUDA Graph & Programmatic Dependent Launch](#cuda-graph--programmatic-dependent-launch)
- [MTP](#mtp)
- [Autoregressive MTP Layers](#autoregressive-mtp-layers)
- [Relax Acceptance Verification](#relax-acceptance-verification)
- [Multi-streams](#multi-streams)
- [Sparse Experts as GEMMs](#sparse-experts-as-gemms-only-works-when-moe_backendcutlass)
- [Re-balanced the sparse experts](#re-balanced-the-sparse-experts)
- [Mixed ETP](#mixed-etp)
- [Smart Router](#smart-router)
- [Kernel Level optimizations](#kernel-level-optimizations)
- [Attention Kernel](#attention-kernel)
- [Grouped GEMM](#grouped-gemm)
- [CUTLASS Backend](#cutlass-backend-default-backend)
- [TRTLLM Backend](#trtllm-backend)
- [Communication Kernel](#communication-kernel)
- [Dense GEMM optimization](#dense-gemm-optimization)
- [Fuse_A_GEMM](#fuse_a_gemm)
- [RouterGEMM](#routergemm)
- [Kernel fusion](#kernel-fusion)
- [How to reproduce](#how-to-reproduce)
- [Future Works](#future-works)
- [Acknowledgment](#acknowledgment)
- [Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs](#pushing-latency-boundaries-optimizing-deepseek-r1-performance-on-nvidia-b200-gpus)
- [Table of Contents](#table-of-contents)
- [Background](#background)
- [Implementation Configuration](#implementation-configuration)
- [Workload Profile](#workload-profile)
- [Model Architecture](#model-architecture)
- [Precision Strategy](#precision-strategy)
- [Parallelism Strategy](#parallelism-strategy)
- [Everything in One Diagram](#everything-in-one-diagram)
- [Key Optimizations](#key-optimizations)
- [System Level optimizations](#system-level-optimizations)
- [CUDA Graph \& Programmatic Dependent Launch](#cuda-graph--programmatic-dependent-launch)
- [MTP](#mtp)
- [Autoregressive MTP Layers](#autoregressive-mtp-layers)
- [Relax Acceptance Verification](#relax-acceptance-verification)
- [Multi-streams](#multi-streams)
- [Sparse Experts as GEMMs (only works when moe\_backend=CUTLASS)](#sparse-experts-as-gemms-only-works-when-moe_backendcutlass)
- [Re-balanced the sparse experts](#re-balanced-the-sparse-experts)
- [Mixed ETP](#mixed-etp)
- [Smart Router](#smart-router)
- [Kernel Level optimizations](#kernel-level-optimizations)
- [Attention Kernel](#attention-kernel)
- [Grouped GEMM](#grouped-gemm)
- [CUTLASS Backend (default backend)](#cutlass-backend-default-backend)
- [TRTLLM Backend](#trtllm-backend)
- [Communication Kernel](#communication-kernel)
- [Dense GEMM optimization](#dense-gemm-optimization)
- [Fuse\_A\_GEMM](#fuse_a_gemm)
- [RouterGEMM](#routergemm)
- [Kernel fusion](#kernel-fusion)
- [How to reproduce](#how-to-reproduce)
- [Future Works](#future-works)
- [Acknowledgment](#acknowledgment)
## Background
Recent advancements in Large Language Reasoning Models have demonstrated remarkable success, while creating new deployment challenges. A critical challenge emerges from extended Output Sequence Lengths (OSL) due to complex "thinking and reasoning" processes. Longer OSL demands stricter Token-to-Token Latency (TTL) requirements, often forcing concurrency limitations. The most extreme case, single concurrency (min-latency scenario), becomes particularly challenging for real-time applications.

View File

@ -1,26 +1,28 @@
# DeepSeek R1 MTP Implementation and Optimization
by NVIDIA TensorRT-LLM team
## Table of Contents
- [MTP for inference](#mtp-for-inference)
- [Background](#background)
- [MTP Vanilla](#mtp-vanilla)
- [MTP Eagle](#mtp-eagle)
- [MTP implementation in TensorRT-LLM](#mtp-implementation-in-tensorrt-llm)
- [Basic Implementation](#basic-implementation)
- [MTP Modules](#mtp-modules)
- [Attention for MTP](#attention-for-mtp)
- [How to run DeepSeek models with MTP](#how-to-run-deepseek-models-with-mtp)
- [MTP optimization - Relaxed Acceptance](#mtp-optimization---relaxed-acceptance)
- [Relaxed Acceptance](#relaxed-acceptance)
- [How to run the DeepSeek-R1 model with Relaxed Acceptance](#how-to-run-the-deepseek-r1-model-with-relaxed-acceptance)
- [Evaluation](#evaluation)
- [Achieving speedup with MTP speculative decoding](#achieving-speedup-with-mtp-speculative-decoding)
- [Accuracy studies for Relaxed Acceptance](#accuracy-studies-for-relaxed-acceptance)
- [Future Works](#future-works)
- [Tree-based speculative decoding support](#tree-based-speculative-decoding-support)
- [Eagle3 support](#eagle3-support)
- [Fix known issues](#fix-known-issues)
- [Acknowledgment](#acknowledgment)
- [DeepSeek R1 MTP Implementation and Optimization](#deepseek-r1-mtp-implementation-and-optimization)
- [Table of Contents](#table-of-contents)
- [MTP for inference](#mtp-for-inference)
- [Background](#background)
- [MTP Vanilla](#mtp-vanilla)
- [MTP Eagle](#mtp-eagle)
- [MTP implementation in TensorRT-LLM](#mtp-implementation-in-tensorrt-llm)
- [Basic Implementation](#basic-implementation)
- [MTP Modules](#mtp-modules)
- [Attention for MTP](#attention-for-mtp)
- [How to run DeepSeek models with MTP](#how-to-run-deepseek-models-with-mtp)
- [MTP optimization - Relaxed Acceptance](#mtp-optimization---relaxed-acceptance)
- [Relaxed Acceptance](#relaxed-acceptance)
- [How to run the DeepSeek-R1 model with Relaxed Acceptance](#how-to-run-the-deepseek-r1-model-with-relaxed-acceptance)
- [Evaluation](#evaluation)
- [Achieving speedup with MTP speculative decoding](#achieving-speedup-with-mtp-speculative-decoding)
- [Accuracy studies for Relaxed Acceptance](#accuracy-studies-for-relaxed-acceptance)
- [Future Works](#future-works)
- [Tree-based speculative decoding support](#tree-based-speculative-decoding-support)
- [Eagle3 support](#eagle3-support)
- [Fix known issues](#fix-known-issues)
- [Acknowledgment](#acknowledgment)
TensorRT-LLM achieves world-record inference performance for DeepSeek-R1 on NVIDIA Blackwell GPUs, where Multi-Token Prediction (MTP) delivers a significant speedup. In our [previous blog post](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md), we discussed the key optimizations that enable the outstanding inference latency of the DeepSeek-R1 model. This article dives deeper into the implementation and optimization of MTP in TensorRT-LLM.

View File

@ -2,6 +2,8 @@
By NVIDIA TensorRT-LLM team
## Table of Contents
- [Optimizing DeepSeek R1 Throughput on NVIDIA Blackwell GPUs: A Deep Dive for Developers](#optimizing-deepseek-r1-throughput-on-nvidia-blackwell-gpus-a-deep-dive-for-developers)
- [Table of Contents](#table-of-contents)
- [Introduction](#introduction)
- [Precision strategy](#precision-strategy)
- [Parallel strategy](#parallel-strategy)

View File

@ -0,0 +1,715 @@
# Scaling Expert Parallelism in TensorRT-LLM (Part 1: Design and Implementation of Large-scale EP)
By NVIDIA TensorRT-LLM Team
## Table of Contents
- [Scaling Expert Parallelism in TensorRT-LLM (Part 1: Design and Implementation of Large-scale EP)](#scaling-expert-parallelism-in-tensorrt-llmpart-1-design-and-implementation-of-large-scale-ep)
- [Table of Contents](#table-of-contents)
- [Motivation for large-scale EP](#motivation-for-large-scale-ep)
- [Observations over one machine translation dataset](#observations-over-one-machine-translation-dataset)
- [Observation over GSM8K dataset](#observation-over-gsm8k-dataset)
- [High-level design introduction](#high-level-design-introduction)
- [EP communication kernels](#ep-communication-kernels)
- [Motivation of EP communication kernels for GB200](#motivation-of-ep-communication-kernels-for-gb200)
- [EP communication kernels implementation](#ep-communication-kernels-implementation)
- [EP Load Balancer](#ep-load-balancer)
- [Python Interface](#python-interface)
- [C++ extension](#c-extension)
- [Core implementations of host side logics](#core-implementations-of-host-side-logics)
- [Core implementations of GPU side logics](#core-implementations-of-gpu-side-logics)
- [Online EP Load Balancer](#online-ep-load-balancer)
- [Offline EP Load Balancer](#offline-ep-load-balancer)
- [E2E evaluation](#e2e-evaluation)
- [The effect of EP Load Balancer](#the-effect-of-ep-load-balancer)
- [Offline EP Load Balancer](#offline-ep-load-balancer-1)
- [Online EP Load Balancer](#online-ep-load-balancer-1)
- [Performance study](#performance-study)
- [Reproducing steps](#reproducing-steps)
- [The effect of EP Load Balancer](#the-effect-of-ep-load-balancer-1)
- [Step 1: Run inference and collect statistics](#step-1-run-inference-and-collect-statistics)
- [Step 2: Generate the EPLB configuration](#step-2-generate-the-eplb-configuration)
- [Step 3: Run inference with the EPLB configuration](#step-3-run-inference-with-the-eplb-configuration)
- [Miscellaneous](#miscellaneous)
- [Expanded thoughts](#expanded-thoughts)
- [Acknowledgement](#acknowledgement)
The development of models like DeepSeek-V3/R1, which use large-scale fine-grained Mixture-of-Experts (MoE) designs, has significantly advanced open-source model quality. Newly released open-source models such as LLaMA4 and Qwen3 also adopt a similar large-scale fine-grained MoE design principle. However, large-scale MoE models introduce new challenges for inference systems, including high memory demands and inherent expert-level workload imbalance.
In the past, we have shared TensorRT-LLM's optimization experience to [push the latency boundary](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md) of the DeepSeek R1 model, [the implementation and optimization of MTP](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md) (Multi-Token Prediction), and [the optimizations for DeepSeek R1 throughput-oriented performance](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md).
The DeepSeek team has also shared their valuable experience and practice on how to optimize this kind of large-scale Expert Parallelism (EP) model, including [DeepEP](https://github.com/deepseek-ai/DeepEP) and [EPLB](https://github.com/deepseek-ai/EPLB). Also, the DeepSeek team has shared their concrete design considerations in [this](https://arxiv.org/abs/2412.19437) tech report. Building on that prior work, there are also notable community efforts to implement large-scale EP in other inference engines, such as [this](https://lmsys.org/blog/2025-05-05-large-scale-ep/) effort from the SGLang team.
In this tech blog, we will introduce the details of the design and implementation to support E2E large-scale EP in TensorRT-LLM. This blog post mainly covers the following:
* How to leverage NVIDIA GB200 Multi-Node NVLink (MNNVL) HW features to implement high-performance communication kernels.
* How to design and implement an online expert workload balancer to dynamically balance the expert load distribution and adapt to the changes of online traffic patterns. We present:
* The empirical data analysis demonstrating the need to do so.
* The implementation of the online traffic data statistic module.
* The design and implementation of the replication/placement strategy.
* The MoE weight load/re-distributer to balance the online workload across multiple GPUs.
* The changes needed to the MoE router and computation module to adapt to the expert load balancer needs.
* Some preliminary data demonstrating the effectiveness of the current implementation in TensorRT-LLM.
In future tech blogs, we will also cover the following topics:
* The introduction of performance tuning and optimization for TensorRT-LLM large-scale EP GB200 implementation.
* How to implement efficient large-scale EP support for B200/Hopper and other NVIDIA GPUs without MNNVL.
* The best practices to leverage large-scale EP and get performance gains.
* How to combine large-scale EP with other system optimization techniques.
Although this tech blog focuses on TensorRT-LLM, we believe the core ideas and implementation can also be applied to other inference engines to improve inference performance on NVIDIA GPUs. Also, with the help of the community, we would like to figure out how to better modularize the current TensorRT-LLM large-scale EP implementation and make it more easily reusable by the community.
Finally, this tech blog contains implementation details that are targeted at the GB200 system, such as the communication components leveraging the GB200 MNNVL inter-GPU connection, and the MoE weight load/re-distributer module leveraging the high-bandwidth C2C connection between the Grace CPU and the Blackwell GPU. Nevertheless, the overall design principle and software architecture can still apply to non-GB200 NVIDIA GPU systems. To facilitate the extension to other non-GB200 systems, we have deliberately paid attention to the generality of the design and implementation. These changes should be easily composable with other existing components.
## Motivation for large-scale EP
The main motivation for introducing large-scale EP (here meaning EP > 8) comes from the following system considerations:
* We expect to reduce the execution latency thanks to the increased aggregated memory bandwidth to load the expert weights.
* We expect to increase the effective batch size to saturate the GPU computing power.
Note that **when the E2E execution time is dominated by the MoE GroupGEMM computation, introducing large-scale EP is expected to yield clear performance benefits. But if the E2E execution time is not dominated by the MoE GroupGEMM computation, then large-scale EP may bring limited performance benefit.**
Also, there is no free lunch in system design. When the EP size increases beyond 8 (sometimes even below 8), the sparse execution nature of MoE models can inherently trigger an EP-level workload imbalance issue.
Here are some empirical observations based on two datasets (*all the analyses below are done with the **DeepSeek R1 model** on **32 GB200 GPUs***).
### Observations over one machine translation dataset
First, let's have an overview of the imbalance issues across layers:
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture1.png">
</figure>
</div>
<p align="center"><sub><em>Figure 1: The routed token count from rank 0 to all the ranks(including rank 0), for decode iteration 1950, and all the MoE layers</em></sub></p>
In Figure 1, it can be seen clearly that for the MoE in layer 36, many more tokens are sent from **rank 0** to **rank 13**.
If we zoom in on the MoE in layer 36 and record its activated expert rank distribution, there is clearly one rank that is more heavily activated:
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture2.png">
</figure>
</div>
<p align="center"><sub><em>Figure 2: The tokens received for each expert rank for layer 36</em></sub></p>
If we flatten the data to see the routed tokens for each expert, we can see that a few experts are more active than others:
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture3.png">
</figure>
</div>
<p align="center"><sub><em>Figure 3: The tokens received for each expert for layer 36</em></sub></p>
It is also interesting to see that this kind of imbalance issue is very stable across multiple iterations, as shown in the following figure:
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture4.png">
</figure>
</div>
<p align="center"><sub><em>Figure 4: The accumulated token counts received for each expert for layer 36, within 50 decode steps, and the local batch size=256.</em></sub></p>
Clearly, the hot experts in Figure 4 are actually the same as in Figure 3, which only has data for a single decode iteration.
We have also done a duration-based analysis for local batch size = 1, which corresponds to a single request, and observed a similar pattern:
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture5.png">
</figure>
</div>
<p align="center"><sub><em>Figure 5: The accumulated token counts received for each expert for layer 36, within 400 decode iterations, and the local batch size \= 1\.</em></sub></p>
To summarize the findings from this study of the machine translation dataset:
* There are hot spots in some layers where the workload of some EP ranks can be much higher than others.
* This may be caused by the hottest expert, or by several hot experts being located on the same rank.
* The routed token distributions can be the same for tens to hundreds of iteration steps or even more.
* A single request also keeps the same hot experts across steps.
A natural follow-up question is whether these observations change significantly on other datasets, so we have done a similar analysis with the GSM8K dataset.
### Observation over GSM8K dataset
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture6.png">
</figure>
</div>
<p align="center"><sub><em>Figure 6: The routed token count from rank 0 to all the ranks, for iteration 1950, and all the MoE layers</em></sub></p>
In Figure 6, compared with Figure 1, it can be seen that for GSM8K, the hot layer becomes layer 57 instead of layer 36. What, then, is the concrete status of layer 36 for the GSM8K dataset?
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture7.png">
</figure>
</div>
<p align="center"><sub><em>Figure 7: routed token counts from EP rank 0 to other EP ranks, still taking the iteration 1950, MoE layer 36 as the example</em></sub></p>
From Figure 7, it can clearly be observed that the workload imbalance differs from what was observed for the machine translation dataset (in Figure 2).
Based on Figure 8, it can be observed that the workload imbalance is relatively stable across multiple iterations on the GSM8K dataset too, just as for the machine translation dataset.
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture8.png">
</figure>
</div>
<p align="center"><sub><em>Figure 8: The accumulated token counts sent from EP Rank 0 to all the ranks, for MoE layer 57 within 50 decode steps, local batch size=256</em></sub></p>
If we flatten the EP rank-level data to expert-level data, we get the following figure.
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture9.png">
</figure>
</div>
<p align="center"><sub><em>Figure 9: The accumulated token counts received for each expert for layer 57, within 50 decode steps, and the local batch size=256.</em></sub></p>
A similar imbalance pattern also exists for a single request.
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture10.png">
</figure>
</div>
<p align="center"><sub><em>Figure 10: The accumulated token counts received for each expert for layer 57, within 400 decode steps, for a single request</em></sub></p>
If we use another request, we can still observe the expert imbalance issue, although the hot experts can differ, with some in common (expert 10 in this example).
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture11.png">
</figure>
</div>
<p align="center"><sub><em>Figure 11: The accumulated token counts received for each expert for layer 57, within 400 decode steps, for a single request</em></sub></p>
Combining the data analysis of the two datasets, we have the following findings:
* The EP-level workload imbalance issue is common for large-scale EP inference across datasets. The imbalance severity can differ per layer, and the imbalance pattern is dataset-sensitive.
* The EP rank-level imbalance can be caused by a single hottest expert or by multiple hot experts residing on the same EP rank.
* The EP rank imbalance distribution is relatively stable across tens to hundreds of iterations.
* Although the EP rank imbalance distribution is stable over time, different requests can clearly have different EP imbalance distributions.
These findings lead to the following design considerations for TensorRT-LLM's large-scale EP implementation:
* The EP imbalance issue needs to be considered by design to ensure good E2E performance.
* An Online EP Load Balancer (rather than only an Offline EP Load Balancer implementation), based on the real-time online request traffic, is essential to ensure the robustness of the EP balancer.
* The time-dimension stability of EP rank imbalance distribution can be leveraged to re-distribute the MoE weights to different EP ranks in an efficient manner.
In the next section we will illustrate the high-level design.
## High-level design introduction
Based on the detailed analysis and study in the section [Motivation for large-scale EP](#motivation-for-large-scale-ep), it can clearly be observed that expert imbalance is a common pattern for large-scale EP. This EP imbalance can impede the overall system performance in the following ways:
* The hot EP rank will consume more memory (for activations) which can limit the effective max batch size scheduled during the inference process.
* More data will be sent to/received from the hot EP rank.
These issues can result in a system-level congestion effect in which the hot EP rank delays the overall E2E execution.
To make sure large-scale EP can run well, careful considerations are needed to minimize the EP imbalance issue. The overall design is as follows:
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture12.png">
</figure>
</div>
<p align="center"><sub><em>Figure 12: the high-level design of TensorRT-LLM large-scale EP</em></sub></p>
In this design, there are both CPU and GPU side logics:
* CPU side
* Implement the Replication \& Placement algorithms (**Replication \& Placement Compute** component) to achieve a more balanced EP strategy. Those are rather classical algorithms for which CPU computation is more suitable. Furthermore, by offloading this computation to the CPU, the interference with the GPU can be reduced. In the future, machine-learning based algorithms may also be explored and additional design considerations may be needed. The **Replication \& Placement Compute** component will generate the **“Placement Info”** which will then be consumed by both the GPU **Routing** logic and the CPU **Update Weights \& Placement** component. The **Replication \& Placement Compute** component will consume the **Statistics Data** generated by the **Statistics** component which runs on the GPU.
* Orchestrate the process (**Update Weights \& Placement** component) to update and reload the MoE weights from CPU host memory to GPU device memory. This component will also consume the **Placement Info** generated by the **Replication \& Placement Compute** component. Our scalable design allows us to reload the MoE weights from remote GPU memory via MNNVL or NIC.
* GPU side
* This is the main execution workflow of inference. The following new GPU components are introduced with our design:
* EP communication kernels. In Figure 12, those are the **Dispatch** and **Combine** components.
* Online traffic data statistics collector (the **Statistics** component). This component collects the **Statistics Data** which is to be consumed by the **Replication \& Placement Compute** component.
* The MoE router logic (the **Routing** component). It sends tokens to the activated experts. It needs to be adjusted to support the dynamic placement of MoE weights. It also consumes the **Placement Info** generated by the **Replication \& Placement Compute** component.
* The MoE computation logic (the **MoE** component) also needs to be adjusted correspondingly.
* Careful synchronization between CPU and GPU components is needed to ensure the validity of the entire execution process; in particular, to avoid hangs, as well as invalid or sub-optimal executions.
For the **Update Weights \& Placement** component, we identified two design choices:
* Bulk approach
* In this approach, when the MoE weight redistribution logic starts, the inference taking place on the current serving instance has to be paused until the MoE weight redistribution process finishes. We estimate that this can lead to approximately **0.5 \~ 1 second** of online serving stalls, causing request timeouts in the worst cases. These timeouts or stalls can be mitigated at the system level by routing requests to other serving instances, or simply by request replays.
* Layer-wise approach
* In this approach, the MoE weight redistribution is done layer by layer, such that at each decode iteration only certain layers (the number is configurable) have their MoE weights redistributed. With this design, it takes several iterations to re-balance the MoE weights of all the layers. We expect this approach to have almost no impact on the user experience.
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture13.png">
</figure>
</div>
<p align="center"><sub><em>Figure 13: One example of the layer-wise MoE weight re-distribution</em></sub></p>
In our current system, we choose to implement **the layer-wise approach** to minimize the impact on the online user experience. The bulk approach should be much easier to implement and we will not discuss it in this tech blog.
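
As a toy illustration of the layer-wise idea (not the actual TensorRT-LLM scheduler), the sketch below updates only a configurable number of layers per decode iteration, so the whole model is re-balanced over several iterations:

```python
# Toy layer-wise update schedule: only `layers_per_iter` MoE layers have their
# weights re-distributed at each decode iteration, avoiding a bulk stall.
def layerwise_schedule(num_moe_layers: int, layers_per_iter: int):
    """Yield, per decode iteration, the layer indices to update."""
    for start in range(0, num_moe_layers, layers_per_iter):
        yield list(range(start, min(start + layers_per_iter, num_moe_layers)))

# Example: 58 MoE layers, 12 layers per iteration -> full re-balance in 5 iterations.
for it, layers in enumerate(layerwise_schedule(58, 12)):
    print(f"iteration {it}: update layers {layers[0]}..{layers[-1]}")
```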
To implement the layer-wise approach properly, we need to carefully evaluate the capability of different underlying HWs to decide on the concrete implementation.
Let's use GB200 as an example. In Figure 14, we illustrate the communication bandwidth of the different HW elements in a GB200 node.
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture14.png" width="500" >
</figure>
</div>
<p align="center"><sub><em>Figure 14: high-level topology of GB200 system</em></sub></p>
Using the DeepSeek R1 model as an example, with FP4 precision, each MoE expert occupies 24MiB of memory space. There are 256 experts per layer. In total, that's 58 MoE layers, plus 1 MTP layer. So the maximum amount of MoE weights which need to be redistributed, to achieve EP balance, is 348GiB.
One GB200 node has 480GB LPDDR5X memory for each Grace CPU. In total, that's 960GB of host memory across a NUMA domain. One GB200 node can host the entire MoE weights of a model like the DeepSeek R1 LLM in its CPU host memory. Based on it, the MoE weight redistribution can be done by moving the corresponding MoE weights from CPU host memory to GPU device memory.
Let's assume that we target **50ms** inter-token-latency (ITL) as our main latency constraint. Using a back-of-the-envelope calculation, we can compute the amount of expert weights that can be moved from the MoE weight pool (which can be kept in Grace CPU memory or in GPU memory on another node) to the Blackwell GPU (which does the real MoE inference) at each decode iteration:
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture15.png" width="300" >
</figure>
</div>
<p align="center"><sub><em>Figure 15: The theoretical expert count to be updated for each iteration with following 50ms ITL constraints, by using different HW as pools to store the full MoE weight</em></sub></p>
Based on this analysis, if we rely on the Grace CPU memory on each node to store the MoE weight pool, the weights of up to 300 experts can be redistributed to each GPU on the same GB200 node at each decode iteration.
Assuming our goal is to finish the MoE weight re-balancing for the full model within 5 decode iterations, here are some more concrete use-case studies:
* Use-case 1 (with balanced expert placement and no expert replication)
* 64 GPUs with 4 Experts per GPU
* 58 layers, 232 Experts per GPU
* Needs 47 expert updates per iteration; all the methods can satisfy the latency goal.
* Use-case 2 (with both balanced expert placement and replication)
* 64 GPUs or 72 GPUs with 5 Experts per GPU
* 58 layers, 290 Experts per GPU
* Needs 58 expert updates per iteration; all the methods can satisfy the latency goal.
* Use-case 3 (with both balanced expert placement and replication)
* 36 GPUs with 8 Experts per GPU
* 58 layers, 464 Experts per GPU
* Needs 93 expert updates per iteration; all the methods can satisfy the latency goal.
In summary, based on the theoretical analysis, using Grace CPU memory as the pool to hold the full size MoE weights should allow us to achieve the EP (Expert-Parallelism) re-balancing within 5 decode iterations. If we relax the requirements to 10 or more iterations, there can be even more system implementation flexibility.
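
As a sanity check, the small script below reproduces the arithmetic above; the effective host-to-device bandwidth value is an illustrative assumption, not a measured number.

```python
# Back-of-the-envelope arithmetic for MoE weight redistribution (DeepSeek R1, FP4).
MiB, GiB = 1024 ** 2, 1024 ** 3

expert_bytes = 24 * MiB            # one FP4 expert, as stated above
experts_per_layer = 256
moe_layers = 58                    # the 1 MTP layer is ignored here for simplicity

total_moe_weights = expert_bytes * experts_per_layer * moe_layers
print(f"total MoE weights: {total_moe_weights / GiB:.0f} GiB")        # ~348 GiB

itl = 0.050                        # 50 ms inter-token-latency budget
h2d_bandwidth = 150e9              # assumed effective host-to-device bytes/s
experts_per_iter = int(h2d_bandwidth * itl / expert_bytes)
print(f"experts movable per GPU per iteration: {experts_per_iter}")   # ~300

# Use-case 1: 64 GPUs, 4 experts per GPU per layer, 58 layers,
# re-balance the whole model within 5 decode iterations.
experts_per_gpu = 4 * moe_layers                 # 232 resident experts per GPU
updates_per_iter = -(-experts_per_gpu // 5)      # ceil(232 / 5) = 47
print(f"required expert updates per iteration: {updates_per_iter}")
```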
Next we will introduce the implementation details of our large-scale EP system.
## EP communication kernels
We have evaluated multiple ways of implementing the EP communication kernels needed by large-scale EP, including DeepEP, other solutions and the development of an approach from scratch.
The current technical decision is:
* For GB200, we implemented a new set of [custom EP communication kernels](https://github.com/NVIDIA/TensorRT-LLM/pull/3504).
* For non-GB200 systems (such as B200 or Hopper), we chose to integrate DeepEP directly, with some potential enhancement.
The considerations are:
* DeepEP is a great piece of work done by the DeepSeek team. When we started the TensorRT-LLM large-scale EP efforts, our first focus was on GB200. We chose to implement our own custom EP communication kernels as it was easier to introduce optimizations requiring the GB200 MNNVL capability. Also, based on our current evaluation, DeepEP does not provide CUDA graph compatibility for all the scenarios. We believe that CUDA graph is needed for the scenario we are interested in.
* When we started the efforts to enable large-scale EP on Hopper, we concluded that DeepEP could be adapted and meet our needs on this platform. We plan to extend DeepEP to work for B200 in the future.
We are also actively evaluating the possibility of consolidating GB200 and non-GB200 EP communication kernels into a single solution to make the system simpler, and we will keep the community posted on the status.
Now let's talk a little bit more about the optimizations introduced in the custom EP communication kernel implementations.
### Motivation of EP communication kernels for GB200
In the decoding phase with Prefill-Decoding (PD) separation, we observed that the batch size may not be very large, so latency is a significant concern. In this context, compatibility with CUDA Graph is a strong requirement.
[NCCL](https://github.com/NVIDIA/nccl) is a great GPU communication library which provides highly efficient communication kernels and primitives.
For now, its Send and Recv operations require the data size to be explicitly specified when invoking `ncclSend`/`ncclRecv`.
However, in large expert parallel (large-EP) scenarios, the data size to be transferred is determined dynamically based on the model's output at each iteration.
With the current NCCL communication interface, an explicit synchronization is required to send the communication size back to the CPU and launch NCCL calls from the CPU with the corresponding data size. This would break CUDA Graph compatibility.
This limitation forced us to develop high-performance communication kernels that are compatible with CUDA Graph and that can accept communication sizes directly from GPU memory.
We also wanted those kernels, on GB200, to take advantage of MNNVL's memory bandwidth.
### EP communication kernels implementation
Our kernels adopt a communication approach similar to NCCL's LL128 primitive. As this approach strikes a good balance between latency and bandwidth, it is well-suited for LLM inference.
Our custom kernels can read the communication size directly from GPU memory and are compatible with CUDA Graph even when the data size varies across runs.
In our implementation, we use the CUDA Driver API to establish a peer-to-peer (P2P) buffer over MNNVL as a workspace.
Each GPU can access the workspace of other GPUs. The workspace is divided into multiple channels, each assigned to a remote GPU as a write buffer.
Those write buffers are used in a FIFO manner, with flags used to synchronize FIFO status and avoid data corruption.
More details can be found in [PR 3504](https://github.com/NVIDIA/TensorRT-LLM/pull/3504).
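To make the FIFO handshake more concrete, here is a toy, host-side Python model of a single channel. The buffer depth, flag encoding, and method names are illustrative assumptions only; the real kernels operate on MNNVL-mapped GPU memory with LL128-style flag/payload packing rather than Python objects.

```python
# Toy model of one FIFO channel in the P2P workspace (illustrative only).
CHANNEL_DEPTH = 4   # assumed number of write buffers per channel

class FifoChannel:
    def __init__(self):
        self.buffers = [None] * CHANNEL_DEPTH
        self.flags = [0] * CHANNEL_DEPTH   # 0 = free, 1 = filled
        self.head = 0   # next slot the sender writes
        self.tail = 0   # next slot the receiver reads

    def send(self, payload) -> bool:
        if self.flags[self.head]:          # buffer still owned by the receiver
            return False                   # a real sender would spin/wait here
        self.buffers[self.head] = payload
        self.flags[self.head] = 1          # publish: the flag is written after the payload
        self.head = (self.head + 1) % CHANNEL_DEPTH
        return True

    def recv(self):
        if not self.flags[self.tail]:      # nothing published yet
            return None
        payload = self.buffers[self.tail]
        self.flags[self.tail] = 0          # hand the buffer back to the sender
        self.tail = (self.tail + 1) % CHANNEL_DEPTH
        return payload
```

The essential point is that ownership of each buffer is handed back and forth purely through the flags, so no separate size exchange or CPU synchronization is required.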
## EP Load Balancer
TensorRT-LLM implements a set of functionalities to achieve EP Load Balancing. There are several key components:
### Python Interface
The Python interface layer provides a user-friendly, PyTorch/Python-native interface to the MoE Load Balancing implementation, including the Python wrappers for the GPU/CPU synchronization logic and the online data statistics collection, as well as the other logic described in the following subsections.
### C++ extension
The C++ extension acts as the bridge between the PyTorch/Python interface and the C++/CUDA core implementations.
### Core implementations of the host logic
The host-side core logic implements the following key parts:
* Load balancing algorithms
  * Replication algorithm
  * Placement algorithm
* Orchestration logic of MoE weight updates
* MoE weight update logic
### Core implementations of the GPU logic
The GPU core logic contains the following components:
* Online traffic statistics collection
  * To reduce the CPU-GPU back-and-forth synchronization cost, we implement the online traffic statistics logic on the GPU side (a small sketch follows below).
* Expert routing logic
  * The MoE routing logic needs to be enhanced to adapt to the dynamic expert placement produced by EP load balancing.
GPU/CPU synchronization components are also implemented. More details can be found in [PR 4384](https://github.com/NVIDIA/TensorRT-LLM/pull/4384) and [PR 4495](https://github.com/NVIDIA/TensorRT-LLM/pull/4495).
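As a rough sketch of what GPU-side traffic statistics collection looks like, the snippet below accumulates per-expert token counts on the device so that no CPU synchronization is needed during the forward pass. The tensor names, shapes, helper function, and the assumption of a CUDA device are all illustrative, not the actual TensorRT-LLM interfaces.

```python
import torch

NUM_EXPERTS = 256  # e.g. the number of routed experts in DeepSeek-R1

# Persistent per-layer counter kept on the GPU (assumes a CUDA device is available).
expert_token_count = torch.zeros(NUM_EXPERTS, dtype=torch.int64, device="cuda")

def record_expert_traffic(selected_experts: torch.Tensor) -> None:
    """Accumulate how many tokens were routed to each expert.

    selected_experts: [num_tokens, top_k] tensor of routed expert ids,
    as produced by the MoE router for one layer and one iteration.
    """
    counts = torch.bincount(selected_experts.flatten(), minlength=NUM_EXPERTS)
    expert_token_count.add_(counts)  # stays on GPU; read back only when re-balancing
```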
Based on these core utilities, there are two versions of the EP Load Balancer in TensorRT-LLM: the Offline EP Load Balancer and the Online EP Load Balancer.
### Online EP Load Balancer
For production deployments, the Online EP Load Balancer is recommended because it adapts dynamically to changes in the online traffic pattern and thus provides stronger performance guarantees.
However, the Online EP Load Balancer faces several challenges.
First, load balancing introduces dynamic Expert placement. A single Expert's location may shift based on the current workload. For example, if Expert 0 and Expert 1, originally assigned to Rank 0, both become hot experts, the load balancing policy might redistribute them to different ranks alongside cold experts, which necessitates timely updates to the weight data.
We aim for the Online Load Balancer to react swiftly to changes in request patterns and adjust Expert assignments to avoid load imbalance issues. Importantly, we do not want the balancing process to interfere with the online inference execution process, nor do we want to employ a "Stop-The-World" (Bulk) strategy for updating weights.
In large MoE models (such as DeepSeek R1) during the decoding phase, batch sizes are often small, making CUDA Graph an effective acceleration method; especially when high TPS per user is required. This benefit is even more pronounced on platforms like GB200. For this reason, we want the entire load balancing mechanism to be compatible with CUDA Graph.
To avoid invalidating pre-captured CUDA Graphs, we perform in-place weight updates by writing new Expert weights into the same memory locations, rather than swapping out tensor pointers. This ensures the weight tensors' addresses remain unchanged in the Model Engine.
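The following minimal PyTorch sketch illustrates why an in-place copy preserves CUDA Graph validity while rebinding a tensor would not; the tensor shapes are made up for illustration and do not correspond to real MoE weights.

```python
import torch

# Weight held by an expert slot, and the weights of the expert moving into it
# (shapes are illustrative; assumes a CUDA device).
slot_weight = torch.empty(4096, 7168, device="cuda")
new_expert_weight = torch.randn(4096, 7168, device="cuda")

addr_before = slot_weight.data_ptr()
slot_weight.copy_(new_expert_weight)        # in-place update: the captured address is unchanged
assert slot_weight.data_ptr() == addr_before

# slot_weight = new_expert_weight           # would rebind to a new address and invalidate the graph
```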
In this design, each Expert Slot serves as a container for holding an Expert's weights, decoupled from any specific Expert. The number of Expert Slots must be greater than or equal to the total number of Experts so that each Expert always has at least one available Slot. Hot Experts may occupy multiple Slots. Each Slot is identified by a SlotId.
Since the MoE model's routing logic outputs ExpertIds (not SlotIds), we maintain a routing table from ExpertId to SlotId, which is updated periodically by the load balancing policy. The Load Balancer Routing module uses the current routing table (Expert replication information and slots) to map each token to a suitable Expert Slot, as sketched below.
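A hypothetical host-side sketch of that ExpertId-to-SlotId mapping with replication might look like the following. The slot numbers and the round-robin replica choice are illustrative assumptions; in the real system the mapping lives in GPU memory and is consumed by the routing kernel, and the production policy can be smarter (for example, load-aware).

```python
# Made-up routing table: expert 7 is hot and replicated into two slots.
expert_to_slots = {
    7:  [13, 200],   # hot expert replicated into two slots on different ranks
    42: [77],        # regular expert with a single slot
}

def route_token_to_slot(expert_id: int, token_index: int) -> int:
    """Pick one slot among the replicas of an expert via round-robin."""
    slots = expert_to_slots[expert_id]
    return slots[token_index % len(slots)]

# Tokens routed to the replicated expert 7 alternate between its two slots.
print([route_token_to_slot(7, t) for t in range(4)])   # [13, 200, 13, 200]
```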
To make weight updates non-blocking and avoid "Stop-The-World" pauses, we use a layer-wise update approach. After a layer's forward pass completes and before its next forward pass starts, we perform the weight balancing for that layer; if an update is scheduled for this iteration, the next forward pass of that layer waits until the update is done.
As the forward execution is typically driven by a single Python thread invoking a sequence of PyTorch operations, we offload the weight update routine to a background C++ thread. The Python side only initializes the Expert Slots and registers Expert Weights in shared host memory.
During forward execution, we insert lightweight lock/unlock kernels before and after the MoE computations, as well as kernels for collecting statistics and assigning SlotIds to ExpertIds. These kernels must be short and overlap-friendly to minimize the performance impact. As long as the CPU weight-update thread can finish its work on time, the lock/unlock will be very short. All of them, except for the routing kernel, are lightweight and can easily overlap with forward kernels in different CUDA streams; the routing kernel is the primary optimization focus.
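The control flow of this lock-protected, layer-wise update can be sketched in Python as follows. The class and method names are invented for illustration; the real system uses GPU lock/unlock kernels plus a background C++ weight-update thread rather than Python threading.

```python
import threading

class LayerSlotState:
    """Conceptual per-layer state: forward passes and weight updates take turns."""

    def __init__(self):
        self.lock = threading.Lock()   # stands in for the GPU lock/unlock kernels

    def forward_moe(self, layer_id, inputs):
        with self.lock:                # "lock kernel": blocks only if an update is still running
            return self._moe_compute(layer_id, inputs)

    def background_update(self, layer_id, new_weights):
        # Runs on the weight-update thread between two forward passes of this layer.
        with self.lock:
            self._copy_weights_in_place(layer_id, new_weights)

    def _moe_compute(self, layer_id, inputs): ...
    def _copy_weights_in_place(self, layer_id, new_weights): ...
```

In the common case the update finishes before the layer's next forward pass, so the lock is uncontended and the forward pass proceeds without waiting.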
On GB200, we utilize MNNVL for inter-GPU communication during Expert dispatch and combine. Expert weights reside in host memory and are brought into GPU memory via C2C to support asynchronous updates. A multi-threaded Host Copy Engine manages this process, auto-detecting NUMA topology and choosing optimal CPU cores, enabling full asynchrony with model forward passes.
On servers without C2C but with PCIe, if cross-node communication is required, the network traffic and the weight updates may compete for PCIe bandwidth, which requires additional tuning and design consideration. We have not implemented the copy engine for PCIe servers yet; it is on the list of future tasks.
### Offline EP Load Balancer
The Online EP Load Balancer is better suited to production deployments because it reacts to online traffic changes in a timely manner. However, the Offline EP Load Balancer provides a lightweight way to do performance studies, debugging, and validation. You can refer to [this PR](https://github.com/NVIDIA/TensorRT-LLM/pull/4695) to learn more about the implementation of the Offline EP Load Balancer. There is also a tool to collect statistics about the expert activation distribution, which can be used as the input to deduce the EP balancing placement strategy. You can refer to [this](https://github.com/NVIDIA/TensorRT-LLM/tree/feat/large-ep/examples/ep_load_balancer#offline-ep-load-balancer) doc to learn more details and how to run the Offline EP Load Balancer end to end.
## E2E evaluation
### The effect of EP Load Balancer
#### Offline EP Load Balancer
As shown by Figure 1, on the machine translation dataset, MoE layer 36 suffers from extreme expert load imbalance, so we use that layer to illustrate the effect of the EP Load Balancer. We still run DeepSeek-R1 with 32-way expert parallelism on 32 GB200 GPUs.
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture16.png" >
</figure>
</div>
<p align="center"><sub><em>Figure 16: The routed token count by receiving ranks (x-axis) and iterations (y-axis) at layer 36 (No EPLB)</em></sub></p>
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture17.png" >
</figure>
</div>
<p align="center"><sub><em>Figure 17: The routed token count by experts (x-axis) and iterations (y-axis) at layer 36 (No EPLB)</em></sub></p>
Figure 16 displays the routed token count by receiving ranks over 50 iterations, which reflects the workload on each rank. Rank 13 receives significantly more tokens than all other ranks, and this imbalanced workload distribution stays almost constant over the iterations. Figure 17 breaks the workload down by expert. Clearly, two hot experts on rank 13 cause excessive pressure on this rank.
With the above statistics, we can perform offline EPLB. One potential strategy is to maintain the 32-way expert parallelism while increasing expert slots from 8 to 9 per rank. This results in 32 redundant experts and 288 expert slots in total. Figures 18 and 19 show the routed token count after EPLB. Clearly, the per-rank token distribution is much more balanced, and there are no hot experts anymore.
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture18.png" >
</figure>
</div>
<p align="center"><sub><em>Figure 18: The routed token count by receiving ranks (x-axis) and iterations (y-axis) at layer 36 (EPLB with 9 per-rank slots and EP 32)</em></sub></p>
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture19.png" >
</figure>
</div>
<p align="center"><sub><em>Figure 19: The routed token count by experts (x-axis) and iterations (y-axis) at layer 36 (EPLB with 9 per-rank slots and EP 32)</em></sub></p>
Another EPLB strategy is to maintain 8 expert slots per rank while increasing expert parallelism to 36 ways. This strategy also results in 32 redundant experts and 288 expert slots in total. As displayed by Figures 20 and 21, the workloads also become balanced across ranks or expert slots.
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture20.png" >
</figure>
</div>
<p align="center"><sub><em>Figure 20: The routed token count by receiving ranks (x-axis) and iterations (y-axis) at layer 36 (EPLB with 8 per-rank slots and EP 36)</em></sub></p>
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture21.png" >
</figure>
</div>
<p align="center"><sub><em>Figure 21: The routed token count by experts (x-axis) and iterations (y-axis) at layer 36 (EPLB with 8 per-rank slots and EP 36)</em></sub></p>
For each layer and iteration, the load imbalance can be measured using simple metrics such as the standard deviation or the imbalance ratio. Given the routed token counts for all ranks (or experts), the imbalance ratio is defined as $(max - mean) / mean$, which represents the excessive workload received by the hottest rank (or expert). A perfectly balanced load would have an imbalance ratio of 0.
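As a quick sketch of these metrics (assuming NumPy and made-up counts), the standard deviation and imbalance ratio for one layer and one iteration can be computed as follows:

```python
import numpy as np

def load_metrics(routed_tokens):
    """routed_tokens[i] is the token count routed to rank (or expert slot) i."""
    counts = np.asarray(routed_tokens, dtype=np.float64)
    mean = counts.mean()
    std = counts.std()
    imbalance_ratio = (counts.max() - mean) / mean  # 0 means perfectly balanced
    return mean, std, imbalance_ratio

# Example: 32 ranks where one hot rank receives 4x the typical load.
print(load_metrics([1024] * 31 + [4096]))
```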
Table 1 reports the standard deviation and imbalance ratio for the aforementioned cases. Each number is averaged over the per-layer, per-iteration metrics. Without EPLB, the load imbalance is significant -- on average, the hottest rank receives 1.56 times more routed tokens than the mean. EPLB effectively reduces the load imbalance -- on average, the hottest rank receives only about 0.11 times more routed tokens than the mean.
| | By rank: Average | By rank: Std. Dev. | By rank: Imb. Ratio | By expert slot: Average | By expert slot: Std. Dev. | By expert slot: Imb. Ratio |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| No EPLB (8 per-rank slots and EP 32) | 1024 | 491.6 | 1.564 | 128 | 164.1 | 10.948 |
| EPLB (9 per-rank slots and EP 32) | 1024 | 52.0 | 0.109 | 114 | 77.8 | 1.792 |
| EPLB (8 per-rank slots and EP 36) | 1024 | 53.9 | 0.115 | 128 | 87.5 | 1.791 |
*Table 1: The standard deviation and imbalance ratio (average of per-layer and per-iteration metrics)*
#### Online EP Load Balancer
In the previous section, we demonstrated the impact of the Offline EP Load Balancer. Given our implementation of the Online EP Load Balancer, we further examine the dynamic patterns of EP balancing in online conditions.
Let's still use the machine translation dataset, the DeepSeek R1 model, and layer 36 (shown in Figure 1) as the example to understand the online behaviour:
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture22.png" >
</figure>
</div>
<p align="center"><sub><em>Figure 22: The token count sent from rank 0 to all the ranks, run on GB200, with EP32, local batch size=256, with 256 slots (no replication), so each rank hosts 8 experts</em></sub></p>
From Figure 22, it is clear that starting from iteration 1963, once the EPLB takes effect, the originally hottest rank 13 is no longer the hot rank, and the workload originally sent to rank 13 has been redistributed to rank 0 and rank 1.
In Figure 22, only placement adjustment has been done by the Online EPLB. If we further introduce expert replication, the balancing can be improved further, as shown in the following figure:
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture23.png" >
</figure>
</div>
<p align="center"><sub><em>Figure 23: The token count sent from rank 0 to all the ranks, run on GB200, with EP32, local batch size=256, with 288 slots (with replication), so each rank hosts 9 experts</em></sub></p>
Clearly, by introducing expert replication when doing the EPLB, the EP balancing can be further improved.
More elaborate experiments can be designed to observe the Online EPLB taking effect periodically during online serving to balance the EP workload dynamically, and we welcome the community to report any interesting EPLB patterns they observe.
### Performance study
Note: all the representative workloads illustrated in this section come from performance traces extracted from DeepSeek R1 inference execution. The E2E performance tuning/optimization is still ongoing, and we will discuss it in future technical blogs.
Let's use some representative workloads to illustrate the performance impact with large-scale EP.
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture24.png" width="500" >
</figure>
</div>
<p align="center"><sub><em>Figure 24: EP impact over MoE Group GEMM and EP communication</em></sub></p>
In Figure 24, it can be observed that increasing the EP size from 4 to 72 reduces the MoE Group GEMM computation time, while the EP communication time stays almost constant (Reduce/Scatter is used for EP4/EP8, and All2All for EP>8).
When the EP size increases from 18 to 32, the speed-up diminishes. We are working on optimizing this.
Next, let's use some representative workloads to understand the performance impact with EPLB.
<div align="center">
<figure>
<img src="../media/tech_blog4_Picture25.png" width="500" >
</figure>
</div>
<p align="center"><sub><em>Figure 25: EPLB performance impact</em></sub></p>
From Figure 25, we can see that EPLB brings a clear performance improvement as the EP size increases, for both the MoE GroupGEMM and the EP communication times.
## Reproducing steps
Currently, to run through the reproduction steps described in this section, please use this [feature branch](https://github.com/NVIDIA/TensorRT-LLM/tree/feat/large-ep/tensorrt_llm). It will be merged into the main branch soon.
### The effect of EP Load Balancer
Please refer to the [EP Load Balancer example](https://github.com/NVIDIA/TensorRT-LLM/tree/feat/large-ep/examples/ep_load_balancer) for how to reproduce the results for the Offline EP Load Balancer.
##### Step 1: Run inference and collect statistics
To generate the necessary statistics for load rebalancing, run your model on a target dataset and count the routed expert IDs during inference. Once the counting process is complete, the statistics will be saved for further processing.
Set up some environment variables:
```bash
export MODEL_NAME=deepseek-ai/DeepSeek-R1
export MODEL_PATH=<YOUR_MODEL_PATH>
# Set the expert statistic data path
export EXPERT_STATISTIC_PATH=./expert_statistic
# Enable counting of routed expert IDs from iteration 100 to iteration 200
export EXPERT_STATISTIC_ITER_RANGE=100-200
```
Prepare a dataset following the [benchmarking documentation](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-benchmarking.md#preparing-a-dataset) and save it as `./dataset.json`.
Run 32-way expert parallelism inference on the prepared dataset. Please refer to the [LLM API MGMN example](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llm-api/llm_mgmn_trtllm_bench.sh) for details on running `trtllm-bench` on Slurm.
```bash
cat > ./extra_llm_api_options.yaml <<EOF
enable_attention_dp: true
use_cuda_graph: true
EOF
trtllm-llmapi-launch \
trtllm-bench --model ${MODEL_NAME} \
--model_path ${MODEL_PATH} \
throughput \
--tp 32 \
--ep 32 \
--extra_llm_api_options ./extra_llm_api_options.yaml \
--kv_cache_free_gpu_mem_fraction 0.75 \
--backend pytorch \
--dataset ./dataset.json \
--warmup 0 \
--eos_id -1
```
After inference, review the dumped statistic files in `$EXPERT_STATISTIC_PATH`. Run the `examples/ep_load_balancer/report_load_statistics.py` script to show the standard deviation and imbalance ratio metrics:
```bash
python examples/ep_load_balancer/report_load_statistics.py --expert_statistic_path $EXPERT_STATISTIC_PATH
```
The output would look like:
```txt
Load statistics:
mean std imbalance-ratio
3 1024.0 187.955200 0.498043
4 1024.0 202.728516 0.537602
5 1024.0 209.339981 0.458676
...
58 1024.0 570.880676 2.461014
59 1024.0 341.339447 0.717498
60 1024.0 381.045471 1.119648
average 1024.0 491.651199 1.564272
```
##### Step 2: Generate the EPLB configuration
Use the provided `examples/ep_load_balancer/generate_eplb_config.py` script to convert the collected statistics into an EPLB configuration file. Specify the target expert parallelism size (`--ep_size`) and the total number of slots (`--num_slots`) that will be used for deployment. For example, if we choose to maintain 8 expert slots per rank while increasing expert parallelism to 36 ways, there should be 32 redundant experts and 288 expert slots in total.
```bash
python examples/ep_load_balancer/generate_eplb_config.py \
--ep_size 36 \
--num_slots 288 \
--expert_statistic_path $EXPERT_STATISTIC_PATH \
--output_path ./moe_load_balancer.yaml
```
The `./moe_load_balancer.yaml` file would look like:
```yaml
initial_global_assignments:
3: [138, 81, 60, ..., 69, 250, 77]
4: [24, 243, 72, ..., 90, 251, 52]
5: [120, 162, 246, ..., 14, 192, 171]
...
58: [67, 70, 160, ..., 212, 103, 125]
59: [45, 142, 152, ..., 99, 205, 49]
60: [34, 162, 119, ..., 234, 26, 129]
num_slots: 288
layer_updates_per_iter: 0
```
##### Step 3: Run inference with the EPLB configuration
Set up some environment variables:
```bash
# Set a new expert statistic data path
export EXPERT_STATISTIC_PATH=./expert_statistic_eplb
# Enable counting of routed expert IDs from iteration 100 to iteration 200
export EXPERT_STATISTIC_ITER_RANGE=100-200
```
Run 36-way expert parallelism inference with the EPLB configuration incorporated:
```bash
cat > ./extra_llm_api_options_eplb.yaml <<EOF
enable_attention_dp: true
use_cuda_graph: true
moe_load_balancer: ./moe_load_balancer.yaml
EOF
trtllm-llmapi-launch \
trtllm-bench --model ${MODEL_NAME} \
--model_path ${MODEL_PATH} \
throughput \
--tp 36 \
--ep 36 \
--extra_llm_api_options ./extra_llm_api_options_eplb.yaml \
--kv_cache_free_gpu_mem_fraction 0.75 \
--backend pytorch \
--dataset ./dataset.json \
--warmup 0 \
--eos_id -1
```
Run the `examples/ep_load_balancer/report_load_statistics.py` script again:
```bash
python examples/ep_load_balancer/report_load_statistics.py --expert_statistic_path $EXPERT_STATISTIC_PATH
```
The output would look like:
```txt
Load statistics:
mean std imbalance-ratio
3 1024.0 37.612328 0.081947
4 1024.0 42.367714 0.093256
5 1024.0 42.623219 0.092623
...
58 1024.0 49.167507 0.113420
59 1024.0 44.529514 0.092314
60 1024.0 48.408348 0.101029
average 1024.0 53.976442 0.115378
```
> **Note:** Counting expert IDs can significantly hurt performance, so remember to disable it by unsetting `EXPERT_STATISTIC_ITER_RANGE` when running inference for benchmarking or production purposes.
### Miscellaneous
- **GB200 NUMA binding**: On GB200, GPU memory is also exposed as NUMA nodes, so the system can allocate host memory from GPU memory. It is suggested to use `numactl -m 0,1` to bind memory allocations to the CPU NUMA nodes if you don't want that to happen.
- **Shared Memory Clean-Up**: To achieve online load balancing, all expert weights are stored in shared host memory. The 4 ranks on the same GB200 node share the same expert weights to save memory. Normally, this shared host memory is cleaned up at process exit, but if the process exits abnormally, it may not get the chance to be cleaned. In that case, you may need to manually check the `/dev/shm` directory and delete `/dev/shm/moe_shared_*` if present.
## Expanded thoughts
We deeply acknowledge the system innovation from the DeepSeek team. The introduction of large-scale EP support into their in-house inference system, and their open spirit in sharing their engineering insights with the community, are extremely valuable and have already advanced the design of inference systems across the community.
**Also we want to point out that there are no magical solutions when doing system design and optimization, such as large-scale EP.**
Based on our current performance analysis, when you plan to apply large-scale EP, you should take the following factors into consideration:
* Is the MoE GroupGEMM computation time an E2E performance bottleneck?
  * Large-scale EP mainly helps reduce the MoE GroupGEMM execution time by reducing the expert weight loading pressure and thus increasing the compute intensity of the MoE GroupGEMM layer. For your workload setting, if the MoE GroupGEMM computation is not the bottleneck, then large-scale EP may not help much.
* The latency constraints.
  * Large-scale EP mostly helps when there are strict latency constraints, especially on GB200/B200 with larger memory capacity. For GPUs with less memory capacity, or for scenarios with looser latency constraints, large-scale EP can still help because it enables higher concurrency and better tokens/s/GPU.
* The available HW spec.
  * The optimal configuration for large-scale EP depends on the GPU specifications, including memory bandwidth, capacity, inter-GPU bandwidth, and compute power, which determine both whether to employ large-scale EP and the ideal degree of parallelism.
* System complexity and the production deployment constraints.
  * Without a fault tolerance guarantee, large-scale EP can increase the online system failure ratio. Even if it is possible to do cluster-level coordination to route traffic to other running serving instances when certain large-scale EP serving instances fail, the large number of GPUs required for a single-instance deployment of large-scale EP can increase system-level deployment challenges.
**In the future, we plan to summarize and share more of the best practices of deploying with large-scale EP techniques.**
**Please use your own judgement to decide whether to adopt large-scale EP in your system and, if you do, what EP size and concrete deployment settings best suit your own requirements.**
The current TensorRT-LLM large-scale EP implementation is not perfect, and there are still known limitations (community contributions are welcome to help us improve). For example, we need:
* More platform coverage
  * Extending the support to cover other non-GB200 NVIDIA GPU HWs. **We are actively working on this now.**
  * Currently the large-scale EP support only covers the NVFP4 data precision; incremental efforts are needed to cover FP8 and INT8/INT4 data precisions.
* Performance
  * Further performance tuning and optimizations. **We are actively working on this now.**
  * More validation with workloads close to production traffic. **Here we highly welcome the community's feedback to help us calibrate the TensorRT-LLM large-scale EP implementation based on more concrete workloads.**
  * Thorough validation of the combination with other inference core features, such as disaggregated serving, speculative decoding, validation on more MoE model families, etc. **We are actively working on this now.**
* Ease-of-use
  * Easy customization
    * We believe large-scale EP can be decomposed into at least two layers:
      * A core layer, developed by inference engine developers. This layer contains the customized EP communication kernels, the CPU-GPU synchronization logic, and the MoE weight redistribution logic.
      * A strategy layer, which can be co-developed by inference engine developers and machine learning researchers. This layer contains tools to collect online traffic statistics in different ways, and algorithms for the optimal replication and placement of experts.
    * Based on this understanding, we plan to make the components close to the strategy layer easier to extend and customize by community users. We hope this encourages better ideas to emerge.
  * Based on user inputs describing the deployment requirements (ISL/OSL, latency constraints, HW spec), we hope to be able to automatically recommend the best EP setting.
* Fault tolerance
  * Because a large-scale EP deployment may lead to an increased failure ratio of the online deployment system, it may increase the need for cross-layer interactions with multiple components of the E2E LLM inference system on NVIDIA GPUs, including the low-level communication kernels, the cluster-level orchestrator and scheduler, etc. We are actively working with various NVIDIA engineering teams to push this forward.
We believe the current implementation can be viewed as a reasonable E2E large-scale EP implementation, and we encourage the community to try new ideas and to validate performance. Please share feedback to help us move fast in this area. We are actively tracking the TensorRT-LLM large-scale EP execution in [this](https://github.com/NVIDIA/TensorRT-LLM/issues/4127) GitHub issue to ensure transparency to the community.
## Acknowledgement
The large-scale EP work is another great team effort, spanning kernel-level optimizations, runtime enhancements, and systematic performance analysis and tuning. While we cannot individually acknowledge every contributor, we are proud to recognize the dedicated team of engineers whose collective expertise has helped advance the state of the art of TensorRT-LLM performance.
Through this collaborative endeavor, we have developed valuable insights that allow us to improve GPU utilization for large language model inference. We hope that the techniques and the experience shared in this blog will help the developer community better leverage NVIDIA GPU capabilities in their mission-critical LLM inference applications.