diff --git a/latest/.buildinfo b/latest/.buildinfo
index 40066c9e5f..e399b071ba 100644
--- a/latest/.buildinfo
+++ b/latest/.buildinfo
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 12c1352bd1428d2c6ac709024163b9d8
+config: 5c850ce0a6f2d0ce79a91d25fbeeb241
 tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/latest/_cpp_gen/executor.html b/latest/_cpp_gen/executor.html
index 20c88f06f3..f1700a377d 100644
--- a/latest/_cpp_gen/executor.html
+++ b/latest/_cpp_gen/executor.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1250,6 +1254,553 @@
 
 </dd></dl>
 
+</section>
+<section id="transferagent-h">
+<h2>transferAgent.h<a class="headerlink" href="#transferagent-h" title="Link to this heading">#</a></h2>
+<dl class="cpp type">
+<dt class="sig sig-object cpp">
+<span class="target" id="namespacetensorrt__llm"></span><span class="k"><span class="pre">namespace</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">tensorrt_llm</span></span></span><br /></dt>
+<dd><dl class="cpp type">
+<dt class="sig sig-object cpp">
+<span class="target" id="namespacetensorrt__llm_1_1executor"></span><span class="k"><span class="pre">namespace</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">executor</span></span></span><br /></dt>
+<dd><dl class="cpp type">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cacheE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cacheE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cacheE"></span><span id="tensorrt_llm::executor::kv_cache"></span><span class="target" id="namespacetensorrt__llm_1_1executor_1_1kv__cache"></span><span class="k"><span class="pre">namespace</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">kv_cache</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cacheE" title="Link to this definition">#</a><br /></dt>
+<dd><div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-typedefs">Typedefs</p>
+<dl class="cpp type">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache13TransferDescsE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache13TransferDescsE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache13TransferDescsE"></span><span class="target" id="transferAgent_8h_1ad1f49c49bb08248e8cd955df8292fbae"></span><span class="k"><span class="pre">using</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">TransferDescs</span></span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescsE" title="tensorrt_llm::executor::kv_cache::MemoryDescs"><span class="n"><span class="pre">MemoryDescs</span></span></a><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache13TransferDescsE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp type">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache13RegisterDescsE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache13RegisterDescsE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache13RegisterDescsE"></span><span class="target" id="transferAgent_8h_1a2bb86b812372815ec90e52e4d9a17099"></span><span class="k"><span class="pre">using</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">RegisterDescs</span></span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescsE" title="tensorrt_llm::executor::kv_cache::MemoryDescs"><span class="n"><span class="pre">MemoryDescs</span></span></a><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache13RegisterDescsE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp type">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache11SyncMessageE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache11SyncMessageE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache11SyncMessageE"></span><span class="target" id="transferAgent_8h_1a2286881f67c6a7048094b5b611741cfc"></span><span class="k"><span class="pre">using</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">SyncMessage</span></span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11SyncMessageE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp type">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache18ConnectionInfoTypeE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache18ConnectionInfoTypeE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache18ConnectionInfoTypeE"></span><span class="target" id="transferAgent_8h_1ac763f2223d964bea6fc3424ea1e66896"></span><span class="k"><span class="pre">using</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">ConnectionInfoType</span></span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache18ConnectionInfoTypeE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+<div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-enums">Enums</p>
+<dl class="cpp enum-class">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryTypeE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10MemoryTypeE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10MemoryTypeE"></span><span class="target" id="transferAgent_8h_1a3c82e9ebcab35b8ab4d39e16f4f9039a"></span><span class="k"><span class="pre">enum</span></span><span class="w"> </span><span class="k"><span class="pre">class</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">MemoryType</span></span></span><span class="w"> </span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">uint8_t</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryTypeE" title="Link to this definition">#</a><br /></dt>
+<dd><p><em>Values:</em></p>
+<dl class="cpp enumerator">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType5kDRAME">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10MemoryType5kDRAME"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10MemoryType5kDRAME"></span><span class="target" id="transferAgent_8h_1a3c82e9ebcab35b8ab4d39e16f4f9039aa42114399bc430c192559868559876494"></span><span class="k"><span class="pre">enumerator</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">kDRAM</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType5kDRAME" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp enumerator">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType5kVRAME">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10MemoryType5kVRAME"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10MemoryType5kVRAME"></span><span class="target" id="transferAgent_8h_1a3c82e9ebcab35b8ab4d39e16f4f9039aa1553fa1962a86fec3af0c6d1f2cb34f0"></span><span class="k"><span class="pre">enumerator</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">kVRAM</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType5kVRAME" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp enumerator">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType4kBLKE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10MemoryType4kBLKE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10MemoryType4kBLKE"></span><span class="target" id="transferAgent_8h_1a3c82e9ebcab35b8ab4d39e16f4f9039aa9a08b41ce2bbaa0878f2b23970ab01b0"></span><span class="k"><span class="pre">enumerator</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">kBLK</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType4kBLKE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp enumerator">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType4kOBJE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10MemoryType4kOBJE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10MemoryType4kOBJE"></span><span class="target" id="transferAgent_8h_1a3c82e9ebcab35b8ab4d39e16f4f9039aa04ae6375ff7dd60354d217d706198112"></span><span class="k"><span class="pre">enumerator</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">kOBJ</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType4kOBJE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp enumerator">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType5kFILEE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10MemoryType5kFILEE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10MemoryType5kFILEE"></span><span class="target" id="transferAgent_8h_1a3c82e9ebcab35b8ab4d39e16f4f9039aa4f5e30bd18513f0849246100edf4b267"></span><span class="k"><span class="pre">enumerator</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">kFILE</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType5kFILEE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</dd></dl>
+
+<dl class="cpp enum-class">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOpE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10TransferOpE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10TransferOpE"></span><span class="target" id="transferAgent_8h_1a3e6174d68fd0641f72787ca2b45a0fee"></span><span class="k"><span class="pre">enum</span></span><span class="w"> </span><span class="k"><span class="pre">class</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">TransferOp</span></span></span><span class="w"> </span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">uint8_t</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOpE" title="Link to this definition">#</a><br /></dt>
+<dd><p><em>Values:</em></p>
+<dl class="cpp enumerator">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOp5kREADE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10TransferOp5kREADE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10TransferOp5kREADE"></span><span class="target" id="transferAgent_8h_1a3e6174d68fd0641f72787ca2b45a0feea8fbb854b62e34a1f77d600f286f5d449"></span><span class="k"><span class="pre">enumerator</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">kREAD</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOp5kREADE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp enumerator">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOp6kWRITEE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10TransferOp6kWRITEE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10TransferOp6kWRITEE"></span><span class="target" id="transferAgent_8h_1a3e6174d68fd0641f72787ca2b45a0feea9b3031051108ec0a493a1c56c664a6f3"></span><span class="k"><span class="pre">enumerator</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">kWRITE</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOp6kWRITEE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</dd></dl>
+
+</div>
+<div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-functions">Functions</p>
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4IDpEN12tensorrt_llm8executor8kv_cache17makeTransferAgentENSt10unique_ptrI17BaseTransferAgentEERKNSt6stringEDpRR4Args">
+<span id="_CPPv3IDpEN12tensorrt_llm8executor8kv_cache17makeTransferAgentERKNSt6stringEDpRR4Args"></span><span id="_CPPv2IDpEN12tensorrt_llm8executor8kv_cache17makeTransferAgentERKNSt6stringEDpRR4Args"></span><span class="k"><span class="pre">template</span></span><span class="p"><span class="pre">&lt;</span></span><span class="k"><span class="pre">typename</span></span><span class="w"> </span><span class="p"><span class="pre">...</span></span><span class="sig-name descname sig-name-template"><span class="n"><span class="pre">Args</span></span></span><span class="p"><span class="pre">&gt;</span></span><br /><span class="target" id="transferAgent_8h_1a962b2ba4955f3a9e8f0da6eaca718077"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">unique_ptr</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgentE" title="tensorrt_llm::executor::kv_cache::BaseTransferAgent"><span class="n"><span class="pre">BaseTransferAgent</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">makeTransferAgent</span></span></span><span class="sig-paren">(</span>
+
+<dl>
+<dd><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">backend</span></span></em>,</dd>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4IDpEN12tensorrt_llm8executor8kv_cache17makeTransferAgentENSt10unique_ptrI17BaseTransferAgentEERKNSt6stringEDpRR4Args" title="tensorrt_llm::executor::kv_cache::makeTransferAgent::Args"><span class="n"><span class="pre">Args</span></span></a><span class="p"><span class="pre">&amp;</span></span><span class="p"><span class="pre">&amp;</span></span><span class="p"><span class="pre">...</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">args</span></span></em>,</dd>
+</dl>
+
+<span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4IDpEN12tensorrt_llm8executor8kv_cache17makeTransferAgentENSt10unique_ptrI17BaseTransferAgentEERKNSt6stringEDpRR4Args" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+<dl class="cpp class">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDescE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache9AgentDescE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache9AgentDescE"></span><span id="tensorrt_llm::executor::kv_cache::AgentDesc"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1AgentDesc"></span><span class="k"><span class="pre">class</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">AgentDesc</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDescE" title="Link to this definition">#</a><br /></dt>
+<dd><div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-public-functions">Public Functions</p>
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDesc9AgentDescENSt6stringE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache9AgentDesc9AgentDescENSt6stringE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache9AgentDesc9AgentDescENSt6stringE"></span><span id="tensorrt_llm::executor::kv_cache::AgentDesc::AgentDesc__ss"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1AgentDesc_1a48dcdf4866378fb0c32b23ba8af5579a"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">AgentDesc</span></span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">backendAgentDesc</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDesc9AgentDescENSt6stringE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache9AgentDesc19getBackendAgentDescEv">
+<span id="_CPPv3NK12tensorrt_llm8executor8kv_cache9AgentDesc19getBackendAgentDescEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache9AgentDesc19getBackendAgentDescEv"></span><span id="tensorrt_llm::executor::kv_cache::AgentDesc::getBackendAgentDescC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1AgentDesc_1a14f0346b57a5ad1b9f609a1e1c96c0ae"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="sig-name descname"><span class="n"><span class="pre">getBackendAgentDesc</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="k"><span class="pre">noexcept</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache9AgentDesc19getBackendAgentDescEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+<div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-private-members">Private Members</p>
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDesc17mBackendAgentDescE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache9AgentDesc17mBackendAgentDescE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache9AgentDesc17mBackendAgentDescE"></span><span id="tensorrt_llm::executor::kv_cache::AgentDesc::mBackendAgentDesc__ss"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1AgentDesc_1ae3fc107c8657064a17abac3b3f0f585f"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mBackendAgentDesc</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDesc17mBackendAgentDescE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+</dd></dl>
+
+<dl class="cpp struct">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache15BaseAgentConfigE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache15BaseAgentConfigE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache15BaseAgentConfigE"></span><span id="tensorrt_llm::executor::kv_cache::BaseAgentConfig"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1kv__cache_1_1BaseAgentConfig"></span><span class="k"><span class="pre">struct</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">BaseAgentConfig</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15BaseAgentConfigE" title="Link to this definition">#</a><br /></dt>
+<dd><div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-public-members">Public Members</p>
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache15BaseAgentConfig5mNameE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache15BaseAgentConfig5mNameE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache15BaseAgentConfig5mNameE"></span><span id="tensorrt_llm::executor::kv_cache::BaseAgentConfig::mName__ss"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1kv__cache_1_1BaseAgentConfig_1a2936d275df3a561da7588cd2c1cf28ec"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mName</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15BaseAgentConfig5mNameE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache15BaseAgentConfig13useProgThreadE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache15BaseAgentConfig13useProgThreadE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache15BaseAgentConfig13useProgThreadE"></span><span id="tensorrt_llm::executor::kv_cache::BaseAgentConfig::useProgThread__b"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1kv__cache_1_1BaseAgentConfig_1a314e831a12e6b318d60425b3dc699813"></span><span class="kt"><span class="pre">bool</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">useProgThread</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15BaseAgentConfig13useProgThreadE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+</dd></dl>
+
+<dl class="cpp class">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgentE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache17BaseTransferAgentE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache17BaseTransferAgentE"></span><span id="tensorrt_llm::executor::kv_cache::BaseTransferAgent"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1BaseTransferAgent"></span><span class="k"><span class="pre">class</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">BaseTransferAgent</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgentE" title="Link to this definition">#</a><br /></dt>
+<dd><div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-public-functions">Public Functions</p>
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgentD0Ev">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache17BaseTransferAgentD0Ev"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache17BaseTransferAgentD0Ev"></span><span id="tensorrt_llm::executor::kv_cache::BaseTransferAgent::~BaseTransferAgent"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1BaseTransferAgent_1ac38476f4d667e6959a62992548730e72"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">~BaseTransferAgent</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="k"><span class="pre">default</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgentD0Ev" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent14registerMemoryERK13RegisterDescs">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache17BaseTransferAgent14registerMemoryERK13RegisterDescs"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache17BaseTransferAgent14registerMemoryERK13RegisterDescs"></span><span id="tensorrt_llm::executor::kv_cache::BaseTransferAgent::registerMemory__RegisterDescsCR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1BaseTransferAgent_1af2ab4f613dbe8856dc215e64f327136a"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">registerMemory</span></span></span><span class="sig-paren">(</span><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache13RegisterDescsE" title="tensorrt_llm::executor::kv_cache::RegisterDescs"><span class="n"><span class="pre">RegisterDescs</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">descs</span></span></em><span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent14registerMemoryERK13RegisterDescs" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16deregisterMemoryERK13RegisterDescs">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16deregisterMemoryERK13RegisterDescs"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16deregisterMemoryERK13RegisterDescs"></span><span id="tensorrt_llm::executor::kv_cache::BaseTransferAgent::deregisterMemory__RegisterDescsCR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1BaseTransferAgent_1aa55facd04a0995f9f1837db545b4cb94"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">deregisterMemory</span></span></span><span class="sig-paren">(</span><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache13RegisterDescsE" title="tensorrt_llm::executor::kv_cache::RegisterDescs"><span class="n"><span class="pre">RegisterDescs</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">descs</span></span></em><span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16deregisterMemoryERK13RegisterDescs" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent15loadRemoteAgentERKNSt6stringERK9AgentDesc">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache17BaseTransferAgent15loadRemoteAgentERKNSt6stringERK9AgentDesc"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache17BaseTransferAgent15loadRemoteAgentERKNSt6stringERK9AgentDesc"></span><span id="tensorrt_llm::executor::kv_cache::BaseTransferAgent::loadRemoteAgent__ssCR.AgentDescCR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1BaseTransferAgent_1a1447916398ed57751cb0773875e35b55"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">loadRemoteAgent</span></span></span><span class="sig-paren">(</span>
+
+<dl>
+<dd><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">name</span></span></em>,</dd>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDescE" title="tensorrt_llm::executor::kv_cache::AgentDesc"><span class="n"><span class="pre">AgentDesc</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">agentDesc</span></span></em>,</dd>
+</dl>
+
+<span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent15loadRemoteAgentERKNSt6stringERK9AgentDesc" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17getLocalAgentDescEv">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17getLocalAgentDescEv"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17getLocalAgentDescEv"></span><span id="tensorrt_llm::executor::kv_cache::BaseTransferAgent::getLocalAgentDesc"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1BaseTransferAgent_1abd9748104966f66cde9a1be618487abb"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDescE" title="tensorrt_llm::executor::kv_cache::AgentDesc"><span class="n"><span class="pre">AgentDesc</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getLocalAgentDesc</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17getLocalAgentDescEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent21invalidateRemoteAgentERKNSt6stringE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache17BaseTransferAgent21invalidateRemoteAgentERKNSt6stringE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache17BaseTransferAgent21invalidateRemoteAgentERKNSt6stringE"></span><span id="tensorrt_llm::executor::kv_cache::BaseTransferAgent::invalidateRemoteAgent__ssCR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1BaseTransferAgent_1ac347f34b38bb87755efe08b7d64bb01c"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">invalidateRemoteAgent</span></span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">name</span></span></em><span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent21invalidateRemoteAgentERKNSt6stringE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent22submitTransferRequestsERK15TransferRequest">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache17BaseTransferAgent22submitTransferRequestsERK15TransferRequest"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache17BaseTransferAgent22submitTransferRequestsERK15TransferRequest"></span><span id="tensorrt_llm::executor::kv_cache::BaseTransferAgent::submitTransferRequests__TransferRequestCR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1BaseTransferAgent_1aebf9717ab007f261b7006197de0bee73"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">unique_ptr</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache14TransferStatusE" title="tensorrt_llm::executor::kv_cache::TransferStatus"><span class="n"><span class="pre">TransferStatus</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">submitTransferRequests</span></span></span><span class="sig-paren">(</span>
+
+<dl>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequestE" title="tensorrt_llm::executor::kv_cache::TransferRequest"><span class="n"><span class="pre">TransferRequest</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">request</span></span></em>,</dd>
+</dl>
+
+<span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent22submitTransferRequestsERK15TransferRequest" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17notifySyncMessageERKNSt6stringERK11SyncMessage">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17notifySyncMessageERKNSt6stringERK11SyncMessage"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17notifySyncMessageERKNSt6stringERK11SyncMessage"></span><span id="tensorrt_llm::executor::kv_cache::BaseTransferAgent::notifySyncMessage__ssCR.SyncMessageCR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1BaseTransferAgent_1a5fa37e2a12de2bb6de39c5ac57b1a020"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">notifySyncMessage</span></span></span><span class="sig-paren">(</span>
+
+<dl>
+<dd><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">name</span></span></em>,</dd>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11SyncMessageE" title="tensorrt_llm::executor::kv_cache::SyncMessage"><span class="n"><span class="pre">SyncMessage</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">syncMessage</span></span></em>,</dd>
+</dl>
+
+<span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17notifySyncMessageERKNSt6stringERK11SyncMessage" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent23getNotifiedSyncMessagesEv">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache17BaseTransferAgent23getNotifiedSyncMessagesEv"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache17BaseTransferAgent23getNotifiedSyncMessagesEv"></span><span id="tensorrt_llm::executor::kv_cache::BaseTransferAgent::getNotifiedSyncMessages"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1BaseTransferAgent_1a8b84bb623ba08c93c850f7909e866441"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">unordered_map</span></span><span class="p"><span class="pre">&lt;</span></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">vector</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11SyncMessageE" title="tensorrt_llm::executor::kv_cache::SyncMessage"><span class="n"><span class="pre">SyncMessage</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getNotifiedSyncMessages</span></span></span><span class="sig-paren">(</span>
+
+<dl>
+</dl>
+
+<span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent23getNotifiedSyncMessagesEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17getConnectionInfoEv">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17getConnectionInfoEv"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17getConnectionInfoEv"></span><span id="tensorrt_llm::executor::kv_cache::BaseTransferAgent::getConnectionInfo"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1BaseTransferAgent_1a2387ae36bb9e0ad8fc08a61e0ae0b528"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache18ConnectionInfoTypeE" title="tensorrt_llm::executor::kv_cache::ConnectionInfoType"><span class="n"><span class="pre">ConnectionInfoType</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getConnectionInfo</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17getConnectionInfoEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent18connectRemoteAgentERKNSt6stringERK18ConnectionInfoType">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache17BaseTransferAgent18connectRemoteAgentERKNSt6stringERK18ConnectionInfoType"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache17BaseTransferAgent18connectRemoteAgentERKNSt6stringERK18ConnectionInfoType"></span><span id="tensorrt_llm::executor::kv_cache::BaseTransferAgent::connectRemoteAgent__ssCR.ConnectionInfoTypeCR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1BaseTransferAgent_1a9ab249cb9287d3958c18c252f5ae2353"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">connectRemoteAgent</span></span></span><span class="sig-paren">(</span>
+
+<dl>
+<dd><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">name</span></span></em>,</dd>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache18ConnectionInfoTypeE" title="tensorrt_llm::executor::kv_cache::ConnectionInfoType"><span class="n"><span class="pre">ConnectionInfoType</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">connectionInfo</span></span></em>,</dd>
+</dl>
+
+<span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent18connectRemoteAgentERKNSt6stringERK18ConnectionInfoType" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16checkRemoteDescsERKNSt6stringERK11MemoryDescs">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16checkRemoteDescsERKNSt6stringERK11MemoryDescs"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16checkRemoteDescsERKNSt6stringERK11MemoryDescs"></span><span id="tensorrt_llm::executor::kv_cache::BaseTransferAgent::checkRemoteDescs__ssCR.MemoryDescsCR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1BaseTransferAgent_1a2b391691d49d70cb97915f3d336d6ef3"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="kt"><span class="pre">bool</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">checkRemoteDescs</span></span></span><span class="sig-paren">(</span>
+
+<dl>
+<dd><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">name</span></span></em>,</dd>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescsE" title="tensorrt_llm::executor::kv_cache::MemoryDescs"><span class="n"><span class="pre">MemoryDescs</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">memoryDescs</span></span></em>,</dd>
+</dl>
+
+<span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16checkRemoteDescsERKNSt6stringERK11MemoryDescs" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+</dd></dl>
+
+<dl class="cpp class">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache12DynLibLoaderE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache12DynLibLoaderE"></span><span id="tensorrt_llm::executor::kv_cache::DynLibLoader"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1DynLibLoader"></span><span class="k"><span class="pre">class</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">DynLibLoader</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderE" title="Link to this definition">#</a><br /></dt>
+<dd><div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-public-functions">Public Functions</p>
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9getHandleERKNSt6stringE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache12DynLibLoader9getHandleERKNSt6stringE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache12DynLibLoader9getHandleERKNSt6stringE"></span><span id="tensorrt_llm::executor::kv_cache::DynLibLoader::getHandle__ssCR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1DynLibLoader_1ac53d5bc596a947fa23a4b223bd6e96ad"></span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="p"><span class="pre">*</span></span><span class="sig-name descname"><span class="n"><span class="pre">getHandle</span></span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">name</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9getHandleERKNSt6stringE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4I0EN12tensorrt_llm8executor8kv_cache12DynLibLoader18getFunctionPointerE9FunctionTRKNSt6stringERKNSt6stringE">
+<span id="_CPPv3I0EN12tensorrt_llm8executor8kv_cache12DynLibLoader18getFunctionPointerERKNSt6stringERKNSt6stringE"></span><span id="_CPPv2I0EN12tensorrt_llm8executor8kv_cache12DynLibLoader18getFunctionPointerERKNSt6stringERKNSt6stringE"></span><span class="k"><span class="pre">template</span></span><span class="p"><span class="pre">&lt;</span></span><span class="k"><span class="pre">typename</span></span><span class="w"> </span><span class="sig-name descname sig-name-template"><span class="n"><span class="pre">FunctionT</span></span></span><span class="p"><span class="pre">&gt;</span></span><br /><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1DynLibLoader_1aa120a1793a0add730f8f8a3b4a3fdb02"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4I0EN12tensorrt_llm8executor8kv_cache12DynLibLoader18getFunctionPointerE9FunctionTRKNSt6stringERKNSt6stringE" title="tensorrt_llm::executor::kv_cache::DynLibLoader::getFunctionPointer::FunctionT"><span class="n"><span class="pre">FunctionT</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getFunctionPointer</span></span></span><span class="sig-paren">(</span>
+
+<dl>
+<dd><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">libName</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">funcName</span></span></em>,</dd>
+</dl>
+
+<span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4I0EN12tensorrt_llm8executor8kv_cache12DynLibLoader18getFunctionPointerE9FunctionTRKNSt6stringERKNSt6stringE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderD0Ev">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache12DynLibLoaderD0Ev"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache12DynLibLoaderD0Ev"></span><span id="tensorrt_llm::executor::kv_cache::DynLibLoader::~DynLibLoader"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1DynLibLoader_1afc8ec9ba9f94e6f4d9f92dad576ef78c"></span><span class="sig-name descname"><span class="n"><span class="pre">~DynLibLoader</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderD0Ev" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader12DynLibLoaderEv">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache12DynLibLoader12DynLibLoaderEv"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache12DynLibLoader12DynLibLoaderEv"></span><span id="tensorrt_llm::executor::kv_cache::DynLibLoader::DynLibLoader"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1DynLibLoader_1a7fef19629812725c387457b230b2a18b"></span><span class="sig-name descname"><span class="n"><span class="pre">DynLibLoader</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="k"><span class="pre">default</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader12DynLibLoaderEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader12DynLibLoaderERK12DynLibLoader">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache12DynLibLoader12DynLibLoaderERK12DynLibLoader"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache12DynLibLoader12DynLibLoaderERK12DynLibLoader"></span><span id="tensorrt_llm::executor::kv_cache::DynLibLoader::DynLibLoader__DynLibLoaderCR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1DynLibLoader_1a40ba67066154f109542a91dc8dc53224"></span><span class="sig-name descname"><span class="n"><span class="pre">DynLibLoader</span></span></span><span class="sig-paren">(</span><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader12DynLibLoaderERK12DynLibLoader" title="tensorrt_llm::executor::kv_cache::DynLibLoader::DynLibLoader"><span class="n"><span class="pre">DynLibLoader</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="p"><span class="pre">&amp;</span></span></em><span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="k"><span class="pre">delete</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader12DynLibLoaderERK12DynLibLoader" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderaSERK12DynLibLoader">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache12DynLibLoaderaSERK12DynLibLoader"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache12DynLibLoaderaSERK12DynLibLoader"></span><span id="tensorrt_llm::executor::kv_cache::DynLibLoader::assign-operator__DynLibLoaderCR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1DynLibLoader_1af753fc3984edb13b76ed8c2a3d4c0e95"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderE" title="tensorrt_llm::executor::kv_cache::DynLibLoader"><span class="n"><span class="pre">DynLibLoader</span></span></a><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="sig-name descname"><span class="k"><span class="pre">operator</span></span><span class="o"><span class="pre">=</span></span></span><span class="sig-paren">(</span><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderE" title="tensorrt_llm::executor::kv_cache::DynLibLoader"><span class="n"><span class="pre">DynLibLoader</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="p"><span class="pre">&amp;</span></span></em><span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="k"><span class="pre">delete</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderaSERK12DynLibLoader" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+<div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-public-static-functions">Public Static Functions</p>
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader11getInstanceEv">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache12DynLibLoader11getInstanceEv"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache12DynLibLoader11getInstanceEv"></span><span id="tensorrt_llm::executor::kv_cache::DynLibLoader::getInstance"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1DynLibLoader_1a536b9e15fe4aac0e3e3965376f9e7655"></span><span class="k"><span class="pre">static</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderE" title="tensorrt_llm::executor::kv_cache::DynLibLoader"><span class="n"><span class="pre">DynLibLoader</span></span></a><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="sig-name descname"><span class="n"><span class="pre">getInstance</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader11getInstanceEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+<div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-private-members">Private Members</p>
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9mDllMutexE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache12DynLibLoader9mDllMutexE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache12DynLibLoader9mDllMutexE"></span><span id="tensorrt_llm::executor::kv_cache::DynLibLoader::mDllMutex__std::mutex"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1DynLibLoader_1a8fde9ddc597323cbf44e3374b352cdb9"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">mutex</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mDllMutex</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9mDllMutexE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9mHandlersE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache12DynLibLoader9mHandlersE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache12DynLibLoader9mHandlersE"></span><span id="tensorrt_llm::executor::kv_cache::DynLibLoader::mHandlers__std::unordered_map:ss.voidP:"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1DynLibLoader_1aa62953ffd11b8b0094a999170bcb964b"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">unordered_map</span></span><span class="p"><span class="pre">&lt;</span></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="p"><span class="pre">*</span></span><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mHandlers</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9mHandlersE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+<div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-private-static-functions">Private Static Functions</p>
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader5dlSymEPvPKc">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache12DynLibLoader5dlSymEPvPKc"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache12DynLibLoader5dlSymEPvPKc"></span><span id="tensorrt_llm::executor::kv_cache::DynLibLoader::dlSym__voidP.cCP"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1DynLibLoader_1a968ec20ae0e3b5aa0c2d138b66f299ff"></span><span class="k"><span class="pre">static</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="p"><span class="pre">*</span></span><span class="sig-name descname"><span class="n"><span class="pre">dlSym</span></span></span><span class="sig-paren">(</span><em class="sig-param"><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="p"><span class="pre">*</span></span><span class="n sig-param"><span class="pre">handle</span></span></em>, <em class="sig-param"><span class="kt"><span class="pre">char</span></span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">*</span></span><span class="n sig-param"><span class="pre">symbol</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader5dlSymEPvPKc" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+</dd></dl>
+
+<dl class="cpp class">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDescE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10MemoryDescE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10MemoryDescE"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDesc"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDesc"></span><span class="k"><span class="pre">class</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">MemoryDesc</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDescE" title="Link to this definition">#</a><br /></dt>
+<dd><div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-public-functions">Public Functions</p>
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescERKNSt6vectorIcEE8uint32_t">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescERKNSt6vectorIcEE8uint32_t"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescERKNSt6vectorIcEE8uint32_t"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDesc::MemoryDesc__std::vector:c:CR.uint32_t"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDesc_1a2a0d8735dd403faea98e2774904ae876"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">MemoryDesc</span></span></span><span class="sig-paren">(</span>
+
+<dl>
+<dd><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">vector</span></span><span class="p"><span class="pre">&lt;</span></span><span class="kt"><span class="pre">char</span></span><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">vec</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">uint32_t</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">deviceId</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span></em>,</dd>
+</dl>
+
+<span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescERKNSt6vectorIcEE8uint32_t" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescEPv6size_t8uint32_t">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescEPv6size_t8uint32_t"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescEPv6size_t8uint32_t"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDesc::MemoryDesc__voidP.s.uint32_t"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDesc_1ad9be073c41d131586b2f83096ea5ed42"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">MemoryDesc</span></span></span><span class="sig-paren">(</span><em class="sig-param"><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="p"><span class="pre">*</span></span><span class="n sig-param"><span class="pre">addr</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">size_t</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">len</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">uint32_t</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">deviceId</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescEPv6size_t8uint32_t" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescE9uintptr_t6size_t8uint32_t">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescE9uintptr_t6size_t8uint32_t"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescE9uintptr_t6size_t8uint32_t"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDesc::MemoryDesc__uintptr_t.s.uint32_t"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDesc_1a02deebfb2875dc0ad55524ea456c5beb"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">MemoryDesc</span></span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">uintptr_t</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">addr</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">size_t</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">len</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">uint32_t</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">deviceId</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescE9uintptr_t6size_t8uint32_t" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache10MemoryDesc7getAddrEv">
+<span id="_CPPv3NK12tensorrt_llm8executor8kv_cache10MemoryDesc7getAddrEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache10MemoryDesc7getAddrEv"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDesc::getAddrC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDesc_1a4a74dbbcf3978170afa7d01070084041"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="n"><span class="pre">uintptr_t</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getAddr</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="k"><span class="pre">noexcept</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache10MemoryDesc7getAddrEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache10MemoryDesc6getLenEv">
+<span id="_CPPv3NK12tensorrt_llm8executor8kv_cache10MemoryDesc6getLenEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache10MemoryDesc6getLenEv"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDesc::getLenC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDesc_1a41dbdc2f221c6f79b3b5570ecfff5b60"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="n"><span class="pre">size_t</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getLen</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="k"><span class="pre">noexcept</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache10MemoryDesc6getLenEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache10MemoryDesc11getDeviceIdEv">
+<span id="_CPPv3NK12tensorrt_llm8executor8kv_cache10MemoryDesc11getDeviceIdEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache10MemoryDesc11getDeviceIdEv"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDesc::getDeviceIdC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDesc_1a0133ed4bdf8ffd4323d335b7fe530e8a"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="n"><span class="pre">uint32_t</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getDeviceId</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="k"><span class="pre">noexcept</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache10MemoryDesc11getDeviceIdEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+<div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-public-static-functions">Public Static Functions</p>
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc9serializeERK10MemoryDescRNSt7ostreamE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10MemoryDesc9serializeERK10MemoryDescRNSt7ostreamE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10MemoryDesc9serializeERK10MemoryDescRNSt7ostreamE"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDesc::serialize__MemoryDescCR.osR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDesc_1a3a98dd704a4bf7023c32032a69182558"></span><span class="k"><span class="pre">static</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">serialize</span></span></span><span class="sig-paren">(</span><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDescE" title="tensorrt_llm::executor::kv_cache::MemoryDesc"><span class="n"><span class="pre">MemoryDesc</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">memoryDesc</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">ostream</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">os</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc9serializeERK10MemoryDescRNSt7ostreamE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc11deserializeERNSt7istreamE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10MemoryDesc11deserializeERNSt7istreamE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10MemoryDesc11deserializeERNSt7istreamE"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDesc::deserialize__isR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDesc_1a097411ed09a8a12dcaee26bbed268764"></span><span class="k"><span class="pre">static</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDescE" title="tensorrt_llm::executor::kv_cache::MemoryDesc"><span class="n"><span class="pre">MemoryDesc</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">deserialize</span></span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">istream</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">is</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc11deserializeERNSt7istreamE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc14serializedSizeERK10MemoryDesc">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10MemoryDesc14serializedSizeERK10MemoryDesc"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10MemoryDesc14serializedSizeERK10MemoryDesc"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDesc::serializedSize__MemoryDescCR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDesc_1a195e62a86d381e190e1525306a240890"></span><span class="k"><span class="pre">static</span></span><span class="w"> </span><span class="n"><span class="pre">size_t</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">serializedSize</span></span></span><span class="sig-paren">(</span><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDescE" title="tensorrt_llm::executor::kv_cache::MemoryDesc"><span class="n"><span class="pre">MemoryDesc</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">memoryDesc</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc14serializedSizeERK10MemoryDesc" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+<div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-private-members">Private Members</p>
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc5mAddrE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10MemoryDesc5mAddrE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10MemoryDesc5mAddrE"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDesc::mAddr__uintptr_t"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDesc_1a4e60eb382918f123f11e6db8fdb3c943"></span><span class="n"><span class="pre">uintptr_t</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mAddr</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc5mAddrE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc4mLenE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10MemoryDesc4mLenE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10MemoryDesc4mLenE"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDesc::mLen__s"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDesc_1a4776ae22b3922505e55eaf4f278d5143"></span><span class="n"><span class="pre">size_t</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mLen</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc4mLenE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc9mDeviceIdE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10MemoryDesc9mDeviceIdE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10MemoryDesc9mDeviceIdE"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDesc::mDeviceId__uint32_t"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDesc_1a3365a3c18600915e57e9e034cef567ee"></span><span class="n"><span class="pre">uint32_t</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mDeviceId</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc9mDeviceIdE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+</dd></dl>
+
+<dl class="cpp class">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescsE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache11MemoryDescsE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache11MemoryDescsE"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDescs"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDescs"></span><span class="k"><span class="pre">class</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">MemoryDescs</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescsE" title="Link to this definition">#</a><br /></dt>
+<dd><div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-public-functions">Public Functions</p>
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs11MemoryDescsE10MemoryTypeNSt6vectorI10MemoryDescEE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache11MemoryDescs11MemoryDescsE10MemoryTypeNSt6vectorI10MemoryDescEE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache11MemoryDescs11MemoryDescsE10MemoryTypeNSt6vectorI10MemoryDescEE"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDescs::MemoryDescs__MemoryType.std::vector:MemoryDesc:"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDescs_1a8295bed464d811c027ce4691a0e15cd6"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">MemoryDescs</span></span></span><span class="sig-paren">(</span><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryTypeE" title="tensorrt_llm::executor::kv_cache::MemoryType"><span class="n"><span class="pre">MemoryType</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">type</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">vector</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDescE" title="tensorrt_llm::executor::kv_cache::MemoryDesc"><span class="n"><span class="pre">MemoryDesc</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">descs</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs11MemoryDescsE10MemoryTypeNSt6vectorI10MemoryDescEE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache11MemoryDescs7getTypeEv">
+<span id="_CPPv3NK12tensorrt_llm8executor8kv_cache11MemoryDescs7getTypeEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache11MemoryDescs7getTypeEv"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDescs::getTypeC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDescs_1a111f124275f834d2387b2df5432b71a9"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryTypeE" title="tensorrt_llm::executor::kv_cache::MemoryType"><span class="n"><span class="pre">MemoryType</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getType</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="k"><span class="pre">noexcept</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache11MemoryDescs7getTypeEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache11MemoryDescs8getDescsEv">
+<span id="_CPPv3NK12tensorrt_llm8executor8kv_cache11MemoryDescs8getDescsEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache11MemoryDescs8getDescsEv"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDescs::getDescsC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDescs_1a4e42d94b90a4a5b95e896c533721ae1b"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">vector</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDescE" title="tensorrt_llm::executor::kv_cache::MemoryDesc"><span class="n"><span class="pre">MemoryDesc</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="sig-name descname"><span class="n"><span class="pre">getDescs</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="k"><span class="pre">noexcept</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache11MemoryDescs8getDescsEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+<div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-private-members">Private Members</p>
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs5mTypeE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache11MemoryDescs5mTypeE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache11MemoryDescs5mTypeE"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDescs::mType__MemoryType"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDescs_1afe754835f089dd28d67bec3db8c79518"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryTypeE" title="tensorrt_llm::executor::kv_cache::MemoryType"><span class="n"><span class="pre">MemoryType</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mType</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs5mTypeE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs6mDescsE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache11MemoryDescs6mDescsE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache11MemoryDescs6mDescsE"></span><span id="tensorrt_llm::executor::kv_cache::MemoryDescs::mDescs__std::vector:MemoryDesc:"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1MemoryDescs_1ae7d74ba13fb6f4f05c72609162553738"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">vector</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDescE" title="tensorrt_llm::executor::kv_cache::MemoryDesc"><span class="n"><span class="pre">MemoryDesc</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mDescs</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs6mDescsE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+</dd></dl>
+
+<dl class="cpp class">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequestE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache15TransferRequestE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache15TransferRequestE"></span><span id="tensorrt_llm::executor::kv_cache::TransferRequest"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1TransferRequest"></span><span class="k"><span class="pre">class</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">TransferRequest</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequestE" title="Link to this definition">#</a><br /></dt>
+<dd><div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-public-functions">Public Functions</p>
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest15TransferRequestE10TransferOp13TransferDescs13TransferDescsRKNSt6stringENSt8optionalI11SyncMessageEE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache15TransferRequest15TransferRequestE10TransferOp13TransferDescs13TransferDescsRKNSt6stringENSt8optionalI11SyncMessageEE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache15TransferRequest15TransferRequestE10TransferOp13TransferDescs13TransferDescsRKNSt6stringENSt8optionalI11SyncMessageEE"></span><span id="tensorrt_llm::executor::kv_cache::TransferRequest::TransferRequest__TransferOp.TransferDescs.TransferDescs.ssCR.std::optional:SyncMessage:"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1TransferRequest_1a3f6d832fe6fba6180aaac43a08b8c262"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">TransferRequest</span></span></span><span class="sig-paren">(</span>
+
+<dl>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOpE" title="tensorrt_llm::executor::kv_cache::TransferOp"><span class="n"><span class="pre">TransferOp</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">op</span></span></em>,</dd>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache13TransferDescsE" title="tensorrt_llm::executor::kv_cache::TransferDescs"><span class="n"><span class="pre">TransferDescs</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">srcDescs</span></span></em>,</dd>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache13TransferDescsE" title="tensorrt_llm::executor::kv_cache::TransferDescs"><span class="n"><span class="pre">TransferDescs</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">dstDescs</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">remoteName</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">optional</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11SyncMessageE" title="tensorrt_llm::executor::kv_cache::SyncMessage"><span class="n"><span class="pre">SyncMessage</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">syncMessage</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">nullopt</span></span></em>,</dd>
+</dl>
+
+<span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest15TransferRequestE10TransferOp13TransferDescs13TransferDescsRKNSt6stringENSt8optionalI11SyncMessageEE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest5getOpEv">
+<span id="_CPPv3NK12tensorrt_llm8executor8kv_cache15TransferRequest5getOpEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache15TransferRequest5getOpEv"></span><span id="tensorrt_llm::executor::kv_cache::TransferRequest::getOpC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1TransferRequest_1ac533b6c1c1b8c5397ce8e25833b26158"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOpE" title="tensorrt_llm::executor::kv_cache::TransferOp"><span class="n"><span class="pre">TransferOp</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getOp</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="k"><span class="pre">noexcept</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest5getOpEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest11getSrcDescsEv">
+<span id="_CPPv3NK12tensorrt_llm8executor8kv_cache15TransferRequest11getSrcDescsEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache15TransferRequest11getSrcDescsEv"></span><span id="tensorrt_llm::executor::kv_cache::TransferRequest::getSrcDescsC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1TransferRequest_1a6a6eb8487a43ecb153502a7a09dad96e"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache13TransferDescsE" title="tensorrt_llm::executor::kv_cache::TransferDescs"><span class="n"><span class="pre">TransferDescs</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="sig-name descname"><span class="n"><span class="pre">getSrcDescs</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="k"><span class="pre">noexcept</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest11getSrcDescsEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest11getDstDescsEv">
+<span id="_CPPv3NK12tensorrt_llm8executor8kv_cache15TransferRequest11getDstDescsEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache15TransferRequest11getDstDescsEv"></span><span id="tensorrt_llm::executor::kv_cache::TransferRequest::getDstDescsC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1TransferRequest_1ac86417f2f0dcd9dbdfc71c9db133b879"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache13TransferDescsE" title="tensorrt_llm::executor::kv_cache::TransferDescs"><span class="n"><span class="pre">TransferDescs</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="sig-name descname"><span class="n"><span class="pre">getDstDescs</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="k"><span class="pre">noexcept</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest11getDstDescsEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest13getRemoteNameEv">
+<span id="_CPPv3NK12tensorrt_llm8executor8kv_cache15TransferRequest13getRemoteNameEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache15TransferRequest13getRemoteNameEv"></span><span id="tensorrt_llm::executor::kv_cache::TransferRequest::getRemoteNameC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1TransferRequest_1ace277e9971c3d7a09074d818324bfb71"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="sig-name descname"><span class="n"><span class="pre">getRemoteName</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="k"><span class="pre">noexcept</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest13getRemoteNameEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest14getSyncMessageEv">
+<span id="_CPPv3NK12tensorrt_llm8executor8kv_cache15TransferRequest14getSyncMessageEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache15TransferRequest14getSyncMessageEv"></span><span id="tensorrt_llm::executor::kv_cache::TransferRequest::getSyncMessageC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1TransferRequest_1a8d8c48b778b7abb203f545502d280399"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">optional</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11SyncMessageE" title="tensorrt_llm::executor::kv_cache::SyncMessage"><span class="n"><span class="pre">SyncMessage</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getSyncMessage</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="k"><span class="pre">noexcept</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest14getSyncMessageEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+<div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-private-members">Private Members</p>
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest3mOpE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache15TransferRequest3mOpE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache15TransferRequest3mOpE"></span><span id="tensorrt_llm::executor::kv_cache::TransferRequest::mOp__TransferOp"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1TransferRequest_1a206d45cdbe53b9a4f280c901b51557f3"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOpE" title="tensorrt_llm::executor::kv_cache::TransferOp"><span class="n"><span class="pre">TransferOp</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mOp</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest3mOpE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest9mSrcDescsE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache15TransferRequest9mSrcDescsE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache15TransferRequest9mSrcDescsE"></span><span id="tensorrt_llm::executor::kv_cache::TransferRequest::mSrcDescs__TransferDescs"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1TransferRequest_1abce69416e78057dc63235fefd45e7cdb"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache13TransferDescsE" title="tensorrt_llm::executor::kv_cache::TransferDescs"><span class="n"><span class="pre">TransferDescs</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mSrcDescs</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest9mSrcDescsE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest9mDstDescsE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache15TransferRequest9mDstDescsE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache15TransferRequest9mDstDescsE"></span><span id="tensorrt_llm::executor::kv_cache::TransferRequest::mDstDescs__TransferDescs"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1TransferRequest_1a1843d8b65374bbe93e8c6d05ead25059"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache13TransferDescsE" title="tensorrt_llm::executor::kv_cache::TransferDescs"><span class="n"><span class="pre">TransferDescs</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mDstDescs</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest9mDstDescsE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest11mRemoteNameE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache15TransferRequest11mRemoteNameE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache15TransferRequest11mRemoteNameE"></span><span id="tensorrt_llm::executor::kv_cache::TransferRequest::mRemoteName__ss"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1TransferRequest_1ac7a5fcb8ee1ec8505f8057fdf1b69339"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mRemoteName</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest11mRemoteNameE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest12mSyncMessageE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache15TransferRequest12mSyncMessageE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache15TransferRequest12mSyncMessageE"></span><span id="tensorrt_llm::executor::kv_cache::TransferRequest::mSyncMessage__std::optional:SyncMessage:"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1TransferRequest_1a4e3eb7e8611e553a56c30ea472821854"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">optional</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11SyncMessageE" title="tensorrt_llm::executor::kv_cache::SyncMessage"><span class="n"><span class="pre">SyncMessage</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mSyncMessage</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest12mSyncMessageE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+</dd></dl>
+
+<dl class="cpp class">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache14TransferStatusE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache14TransferStatusE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache14TransferStatusE"></span><span id="tensorrt_llm::executor::kv_cache::TransferStatus"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1TransferStatus"></span><span class="k"><span class="pre">class</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">TransferStatus</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache14TransferStatusE" title="Link to this definition">#</a><br /></dt>
+<dd><div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-public-functions">Public Functions</p>
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache14TransferStatusD0Ev">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache14TransferStatusD0Ev"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache14TransferStatusD0Ev"></span><span id="tensorrt_llm::executor::kv_cache::TransferStatus::~TransferStatus"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1TransferStatus_1a5875c08c018ed556bbb048bd71d4667a"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">~TransferStatus</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="k"><span class="pre">default</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache14TransferStatusD0Ev" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache14TransferStatus11isCompletedEv">
+<span id="_CPPv3NK12tensorrt_llm8executor8kv_cache14TransferStatus11isCompletedEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache14TransferStatus11isCompletedEv"></span><span id="tensorrt_llm::executor::kv_cache::TransferStatus::isCompletedC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1TransferStatus_1a0855f8e280bf6d0357c22a08d7cb79a5"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="kt"><span class="pre">bool</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">isCompleted</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache14TransferStatus11isCompletedEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache14TransferStatus4waitEv">
+<span id="_CPPv3NK12tensorrt_llm8executor8kv_cache14TransferStatus4waitEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache14TransferStatus4waitEv"></span><span id="tensorrt_llm::executor::kv_cache::TransferStatus::waitC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1TransferStatus_1a3295b58ae616e14c205b802e719c8b15"></span><span class="k"><span class="pre">virtual</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">wait</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache14TransferStatus4waitEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+</dd></dl>
+
+</dd></dl>
+
+</dd></dl>
+
+</dd></dl>
+
 </section>
 <section id="serialization-h">
 <h2>serialization.h<a class="headerlink" href="#serialization-h" title="Link to this heading">#</a></h2>
@@ -1514,6 +2065,28 @@
 <span id="_CPPv3N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache11SocketStateE"></span><span id="_CPPv2N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache11SocketStateE"></span><span id="tensorrt_llm::executor::Serialization::serializedSize__kv_cache::SocketStateCR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1Serialization_1ab85a5c8fd7ec4c2ff14cb51b738b71d1"></span><span class="k"><span class="pre">static</span></span><span class="w"> </span><span class="n"><span class="pre">size_t</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">serializedSize</span></span></span><span class="sig-paren">(</span><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cacheE" title="tensorrt_llm::executor::kv_cache"><span class="n"><span class="pre">kv_cache</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11SocketStateE" title="tensorrt_llm::executor::kv_cache::SocketState"><span class="n"><span class="pre">SocketState</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">state</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache11SocketStateE" title="Link to this definition">#</a><br /></dt>
 <dd></dd></dl>
 
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor13Serialization21deserializeAgentStateERNSt7istreamE">
+<span id="_CPPv3N12tensorrt_llm8executor13Serialization21deserializeAgentStateERNSt7istreamE"></span><span id="_CPPv2N12tensorrt_llm8executor13Serialization21deserializeAgentStateERNSt7istreamE"></span><span id="tensorrt_llm::executor::Serialization::deserializeAgentState__isR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1Serialization_1a6757259d146f076e9d58600a8af3cee1"></span><span class="k"><span class="pre">static</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cacheE" title="tensorrt_llm::executor::kv_cache"><span class="n"><span class="pre">kv_cache</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentStateE" title="tensorrt_llm::executor::kv_cache::AgentState"><span class="n"><span class="pre">AgentState</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">deserializeAgentState</span></span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">istream</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">is</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor13Serialization21deserializeAgentStateERNSt7istreamE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10AgentStateERNSt7ostreamE">
+<span id="_CPPv3N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10AgentStateERNSt7ostreamE"></span><span id="_CPPv2N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10AgentStateERNSt7ostreamE"></span><span id="tensorrt_llm::executor::Serialization::serialize__kv_cache::AgentStateCR.osR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1Serialization_1a09a40a9e6b52fc6a82c33cddbeac9e57"></span><span class="k"><span class="pre">static</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">serialize</span></span></span><span class="sig-paren">(</span>
+
+<dl>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cacheE" title="tensorrt_llm::executor::kv_cache"><span class="n"><span class="pre">kv_cache</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentStateE" title="tensorrt_llm::executor::kv_cache::AgentState"><span class="n"><span class="pre">AgentState</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">state</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">ostream</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">os</span></span></em>,</dd>
+</dl>
+
+<span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10AgentStateERNSt7ostreamE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache10AgentStateE">
+<span id="_CPPv3N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache10AgentStateE"></span><span id="_CPPv2N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache10AgentStateE"></span><span id="tensorrt_llm::executor::Serialization::serializedSize__kv_cache::AgentStateCR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1Serialization_1a398a25ea47e1688bf59939c85a53d4ff"></span><span class="k"><span class="pre">static</span></span><span class="w"> </span><span class="n"><span class="pre">size_t</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">serializedSize</span></span></span><span class="sig-paren">(</span><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cacheE" title="tensorrt_llm::executor::kv_cache"><span class="n"><span class="pre">kv_cache</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentStateE" title="tensorrt_llm::executor::kv_cache::AgentState"><span class="n"><span class="pre">AgentState</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">state</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache10AgentStateE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
 <dl class="cpp function">
 <dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor13Serialization21deserializeCacheStateERNSt7istreamE">
 <span id="_CPPv3N12tensorrt_llm8executor13Serialization21deserializeCacheStateERNSt7istreamE"></span><span id="_CPPv2N12tensorrt_llm8executor13Serialization21deserializeCacheStateERNSt7istreamE"></span><span id="tensorrt_llm::executor::Serialization::deserializeCacheState__isR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1Serialization_1a0703b669e35401e746cfa9a4ebe63ae2"></span><span class="k"><span class="pre">static</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cacheE" title="tensorrt_llm::executor::kv_cache"><span class="n"><span class="pre">kv_cache</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10CacheStateE" title="tensorrt_llm::executor::kv_cache::CacheState"><span class="n"><span class="pre">CacheState</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">deserializeCacheState</span></span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">istream</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">is</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor13Serialization21deserializeCacheStateERNSt7istreamE" title="Link to this definition">#</a><br /></dt>
@@ -2398,6 +2971,34 @@
 <span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK21InflightBatchingStats" title="Link to this definition">#</a><br /></dt>
 <dd></dd></dl>
 
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor13Serialization28deserializeSpecDecodingStatsERNSt7istreamE">
+<span id="_CPPv3N12tensorrt_llm8executor13Serialization28deserializeSpecDecodingStatsERNSt7istreamE"></span><span id="_CPPv2N12tensorrt_llm8executor13Serialization28deserializeSpecDecodingStatsERNSt7istreamE"></span><span id="tensorrt_llm::executor::Serialization::deserializeSpecDecodingStats__isR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1Serialization_1a08c01ef4092ee77ba37d30a56e7a567c"></span><span class="k"><span class="pre">static</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor17SpecDecodingStatsE" title="tensorrt_llm::executor::SpecDecodingStats"><span class="n"><span class="pre">SpecDecodingStats</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">deserializeSpecDecodingStats</span></span></span><span class="sig-paren">(</span>
+
+<dl>
+<dd><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">istream</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">is</span></span></em>,</dd>
+</dl>
+
+<span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor13Serialization28deserializeSpecDecodingStatsERNSt7istreamE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK17SpecDecodingStatsRNSt7ostreamE">
+<span id="_CPPv3N12tensorrt_llm8executor13Serialization9serializeERK17SpecDecodingStatsRNSt7ostreamE"></span><span id="_CPPv2N12tensorrt_llm8executor13Serialization9serializeERK17SpecDecodingStatsRNSt7ostreamE"></span><span id="tensorrt_llm::executor::Serialization::serialize__SpecDecodingStatsCR.osR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1Serialization_1a6f15b088fba6d48faa1bff296326bb8e"></span><span class="k"><span class="pre">static</span></span><span class="w"> </span><span class="kt"><span class="pre">void</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">serialize</span></span></span><span class="sig-paren">(</span>
+
+<dl>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor17SpecDecodingStatsE" title="tensorrt_llm::executor::SpecDecodingStats"><span class="n"><span class="pre">SpecDecodingStats</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">specDecStats</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">ostream</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">os</span></span></em>,</dd>
+</dl>
+
+<span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK17SpecDecodingStatsRNSt7ostreamE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK17SpecDecodingStats">
+<span id="_CPPv3N12tensorrt_llm8executor13Serialization14serializedSizeERK17SpecDecodingStats"></span><span id="_CPPv2N12tensorrt_llm8executor13Serialization14serializedSizeERK17SpecDecodingStats"></span><span id="tensorrt_llm::executor::Serialization::serializedSize__SpecDecodingStatsCR"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1Serialization_1a1d3e364fe8e5cabe5371766da8e5dbf0"></span><span class="k"><span class="pre">static</span></span><span class="w"> </span><span class="n"><span class="pre">size_t</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">serializedSize</span></span></span><span class="sig-paren">(</span><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor17SpecDecodingStatsE" title="tensorrt_llm::executor::SpecDecodingStats"><span class="n"><span class="pre">SpecDecodingStats</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">specDecStats</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK17SpecDecodingStats" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
 <dl class="cpp function">
 <dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor13Serialization25deserializeIterationStatsERNSt6vectorIcEE">
 <span id="_CPPv3N12tensorrt_llm8executor13Serialization25deserializeIterationStatsERNSt6vectorIcEE"></span><span id="_CPPv2N12tensorrt_llm8executor13Serialization25deserializeIterationStatsERNSt6vectorIcEE"></span><span id="tensorrt_llm::executor::Serialization::deserializeIterationStats__std::vector:c:R"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1Serialization_1a9797c0d1af10c396b36f548de7d2e8e2"></span><span class="k"><span class="pre">static</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor14IterationStatsE" title="tensorrt_llm::executor::IterationStats"><span class="n"><span class="pre">IterationStats</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">deserializeIterationStats</span></span></span><span class="sig-paren">(</span>
@@ -2620,8 +3221,8 @@
 </dd></dl>
 
 <dl class="cpp type">
-<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cacheE">
-<span id="_CPPv3N12tensorrt_llm8executor8kv_cacheE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cacheE"></span><span id="tensorrt_llm::executor::kv_cache"></span><span class="target" id="namespacetensorrt__llm_1_1executor_1_1kv__cache"></span><span class="k"><span class="pre">namespace</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">kv_cache</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cacheE" title="Link to this definition">#</a><br /></dt>
+<dt class="sig sig-object cpp">
+<span class="target" id="namespacetensorrt__llm_1_1executor_1_1kv__cache"></span><span class="k"><span class="pre">namespace</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">kv_cache</span></span></span><br /></dt>
 <dd></dd></dl>
 
 </dd></dl>
@@ -2649,6 +3250,11 @@
 <span id="_CPPv3N12tensorrt_llm8executor10SizeType32E"></span><span id="_CPPv2N12tensorrt_llm8executor10SizeType32E"></span><span class="target" id="types_8h_1ad818c2e487265ea3ec0ddd760b768085"></span><span class="k"><span class="pre">using</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">SizeType32</span></span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">int32_t</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor10SizeType32E" title="Link to this definition">#</a><br /></dt>
 <dd></dd></dl>
 
+<dl class="cpp type">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor10SizeType64E">
+<span id="_CPPv3N12tensorrt_llm8executor10SizeType64E"></span><span id="_CPPv2N12tensorrt_llm8executor10SizeType64E"></span><span class="target" id="types_8h_1acda8a22d5fd4b8f6f92ce04c779cf088"></span><span class="k"><span class="pre">using</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">SizeType64</span></span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">int64_t</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor10SizeType64E" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
 <dl class="cpp type">
 <dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor9FloatTypeE">
 <span id="_CPPv3N12tensorrt_llm8executor9FloatTypeE"></span><span id="_CPPv2N12tensorrt_llm8executor9FloatTypeE"></span><span class="target" id="types_8h_1a48053cc72a5a67b3c19c817fb963ecea"></span><span class="k"><span class="pre">using</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">FloatType</span></span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="kt"><span class="pre">float</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor9FloatTypeE" title="Link to this definition">#</a><br /></dt>
@@ -3045,6 +3651,31 @@
 
 </dd></dl>
 
+<dl class="cpp enum-class">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor19KvCacheTransferModeE">
+<span id="_CPPv3N12tensorrt_llm8executor19KvCacheTransferModeE"></span><span id="_CPPv2N12tensorrt_llm8executor19KvCacheTransferModeE"></span><span class="target" id="types_8h_1a345a3e90232624cc0c5adf7896ca273e"></span><span class="k"><span class="pre">enum</span></span><span class="w"> </span><span class="k"><span class="pre">class</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">KvCacheTransferMode</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor19KvCacheTransferModeE" title="Link to this definition">#</a><br /></dt>
+<dd><p>Enum describing the transfer mode for KV cache. </p>
+<p><em>Values:</em></p>
+<dl class="cpp enumerator">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode4DRAME">
+<span id="_CPPv3N12tensorrt_llm8executor19KvCacheTransferMode4DRAME"></span><span id="_CPPv2N12tensorrt_llm8executor19KvCacheTransferMode4DRAME"></span><span class="target" id="types_8h_1a345a3e90232624cc0c5adf7896ca273eaebae17841ce69e653df838d8c20ace8d"></span><span class="k"><span class="pre">enumerator</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">DRAM</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode4DRAME" title="Link to this definition">#</a><br /></dt>
+<dd><p>Copy to/from CPU memory (original approach). </p>
+</dd></dl>
+
+<dl class="cpp enumerator">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode3GDSE">
+<span id="_CPPv3N12tensorrt_llm8executor19KvCacheTransferMode3GDSE"></span><span id="_CPPv2N12tensorrt_llm8executor19KvCacheTransferMode3GDSE"></span><span class="target" id="types_8h_1a345a3e90232624cc0c5adf7896ca273ea495ebb08d96fd1ef02a05992b75a58e0"></span><span class="k"><span class="pre">enumerator</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">GDS</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode3GDSE" title="Link to this definition">#</a><br /></dt>
+<dd><p>Attempt GPUDirect Storage (cuFile). </p>
+</dd></dl>
+
+<dl class="cpp enumerator">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode20POSIX_DEBUG_FALLBACKE">
+<span id="_CPPv3N12tensorrt_llm8executor19KvCacheTransferMode20POSIX_DEBUG_FALLBACKE"></span><span id="_CPPv2N12tensorrt_llm8executor19KvCacheTransferMode20POSIX_DEBUG_FALLBACKE"></span><span class="target" id="types_8h_1a345a3e90232624cc0c5adf7896ca273ea61d064a0dc44ecccf71241b373282ebe"></span><span class="k"><span class="pre">enumerator</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">POSIX_DEBUG_FALLBACK</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode20POSIX_DEBUG_FALLBACKE" title="Link to this definition">#</a><br /></dt>
+<dd><p>Force a POSIX read/write for debugging. </p>
+</dd></dl>
+
+</dd></dl>
+
 </div>
 <div class="breathe-sectiondef docutils container">
 <p class="breathe-sectiondef-title rubric" id="breathe-section-title-functions">Functions</p>
@@ -3787,6 +4418,12 @@
 <dd><p>Stats specific to inflight batching. </p>
 </dd></dl>
 
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor14IterationStats12specDecStatsE">
+<span id="_CPPv3N12tensorrt_llm8executor14IterationStats12specDecStatsE"></span><span id="_CPPv2N12tensorrt_llm8executor14IterationStats12specDecStatsE"></span><span id="tensorrt_llm::executor::IterationStats::specDecStats__std::optional:SpecDecodingStats:"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1IterationStats_1a053228e657564091c3de901f262523b6"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">optional</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor17SpecDecodingStatsE" title="tensorrt_llm::executor::SpecDecodingStats"><span class="n"><span class="pre">SpecDecodingStats</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">specDecStats</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor14IterationStats12specDecStatsE" title="Link to this definition">#</a><br /></dt>
+<dd><p>Stats specific to speculative decoding. </p>
+</dd></dl>
+
 </div>
 </dd></dl>
 
@@ -4133,6 +4770,53 @@
 </div>
 </dd></dl>
 
+<dl class="cpp struct">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor17SpecDecodingStatsE">
+<span id="_CPPv3N12tensorrt_llm8executor17SpecDecodingStatsE"></span><span id="_CPPv2N12tensorrt_llm8executor17SpecDecodingStatsE"></span><span id="tensorrt_llm::executor::SpecDecodingStats"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1SpecDecodingStats"></span><span class="k"><span class="pre">struct</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">SpecDecodingStats</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor17SpecDecodingStatsE" title="Link to this definition">#</a><br /></dt>
+<dd><div class="docutils container">
+<em>#include &lt;types.h&gt;</em></div>
+<p>Struct that holds speculative decoding stats. </p>
+<div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-public-members">Public Members</p>
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor17SpecDecodingStats14numDraftTokensE">
+<span id="_CPPv3N12tensorrt_llm8executor17SpecDecodingStats14numDraftTokensE"></span><span id="_CPPv2N12tensorrt_llm8executor17SpecDecodingStats14numDraftTokensE"></span><span id="tensorrt_llm::executor::SpecDecodingStats::numDraftTokens__SizeType64"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1SpecDecodingStats_1a2ba8cdb4391b566673ac59fbce115cb0"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor10SizeType64E" title="tensorrt_llm::executor::SizeType64"><span class="n"><span class="pre">SizeType64</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">numDraftTokens</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor17SpecDecodingStats14numDraftTokensE" title="Link to this definition">#</a><br /></dt>
+<dd><p>Total number of proposed draft tokens for all requests. </p>
+</dd></dl>
+
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor17SpecDecodingStats17numAcceptedTokensE">
+<span id="_CPPv3N12tensorrt_llm8executor17SpecDecodingStats17numAcceptedTokensE"></span><span id="_CPPv2N12tensorrt_llm8executor17SpecDecodingStats17numAcceptedTokensE"></span><span id="tensorrt_llm::executor::SpecDecodingStats::numAcceptedTokens__SizeType64"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1SpecDecodingStats_1a82d228d94c33c11b41707c2275eb9157"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor10SizeType64E" title="tensorrt_llm::executor::SizeType64"><span class="n"><span class="pre">SizeType64</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">numAcceptedTokens</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor17SpecDecodingStats17numAcceptedTokensE" title="Link to this definition">#</a><br /></dt>
+<dd><p>Total number of accepted draft tokens for all requests. </p>
+</dd></dl>
+
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor17SpecDecodingStats26numRequestsWithDraftTokensE">
+<span id="_CPPv3N12tensorrt_llm8executor17SpecDecodingStats26numRequestsWithDraftTokensE"></span><span id="_CPPv2N12tensorrt_llm8executor17SpecDecodingStats26numRequestsWithDraftTokensE"></span><span id="tensorrt_llm::executor::SpecDecodingStats::numRequestsWithDraftTokens__SizeType64"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1SpecDecodingStats_1af6b842c2bf45eb3c5711fafb45d6b004"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor10SizeType64E" title="tensorrt_llm::executor::SizeType64"><span class="n"><span class="pre">SizeType64</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">numRequestsWithDraftTokens</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor17SpecDecodingStats26numRequestsWithDraftTokensE" title="Link to this definition">#</a><br /></dt>
+<dd><p>Number of requests with at least one draft token in batch. </p>
+</dd></dl>
+
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor17SpecDecodingStats16acceptanceLengthE">
+<span id="_CPPv3N12tensorrt_llm8executor17SpecDecodingStats16acceptanceLengthE"></span><span id="_CPPv2N12tensorrt_llm8executor17SpecDecodingStats16acceptanceLengthE"></span><span id="tensorrt_llm::executor::SpecDecodingStats::acceptanceLength__double"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1SpecDecodingStats_1a4581cb18cb060aff99d56a7bc2ace5b0"></span><span class="kt"><span class="pre">double</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">acceptanceLength</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor17SpecDecodingStats16acceptanceLengthE" title="Link to this definition">#</a><br /></dt>
+<dd><p>Acceptance length, defined as the average number of tokens produced per step for all requests with at least one draft token. </p>
+</dd></dl>
+
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor17SpecDecodingStats13iterLatencyMSE">
+<span id="_CPPv3N12tensorrt_llm8executor17SpecDecodingStats13iterLatencyMSE"></span><span id="_CPPv2N12tensorrt_llm8executor17SpecDecodingStats13iterLatencyMSE"></span><span id="tensorrt_llm::executor::SpecDecodingStats::iterLatencyMS__double"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1SpecDecodingStats_1abdfdf1844f06c033b41caf259c28859c"></span><span class="kt"><span class="pre">double</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">iterLatencyMS</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor17SpecDecodingStats13iterLatencyMSE" title="Link to this definition">#</a><br /></dt>
+<dd><p>Iteration latency for draft token generation only (ms). </p>
+</dd></dl>
+
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor17SpecDecodingStats13draftOverheadE">
+<span id="_CPPv3N12tensorrt_llm8executor17SpecDecodingStats13draftOverheadE"></span><span id="_CPPv2N12tensorrt_llm8executor17SpecDecodingStats13draftOverheadE"></span><span id="tensorrt_llm::executor::SpecDecodingStats::draftOverhead__double"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1SpecDecodingStats_1a5112514dcef0134ab8e349ab0090553c"></span><span class="kt"><span class="pre">double</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">draftOverhead</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor17SpecDecodingStats13draftOverheadE" title="Link to this definition">#</a><br /></dt>
+<dd><p>Draft overhead, defined as iterLatencyMS (specdec) / iterLatencyMS (total). </p>
+</dd></dl>
+
+</div>
+</dd></dl>
+
 <dl class="cpp struct">
 <dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor19StaticBatchingStatsE">
 <span id="_CPPv3N12tensorrt_llm8executor19StaticBatchingStatsE"></span><span id="_CPPv2N12tensorrt_llm8executor19StaticBatchingStatsE"></span><span id="tensorrt_llm::executor::StaticBatchingStats"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1StaticBatchingStats"></span><span class="k"><span class="pre">struct</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">StaticBatchingStats</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor19StaticBatchingStatsE" title="Link to this definition">#</a><br /></dt>
@@ -6909,16 +7593,18 @@
 <dd></dd></dl>
 
 <dl class="cpp function">
-<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE">
-<span id="_CPPv3N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE"></span><span id="_CPPv2N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE"></span><span id="tensorrt_llm::executor::KvCacheRetentionConfig::KvCacheRetentionConfig__std::vector:TokenRangeRetentionConfig:CR.RetentionPriority.std::optional:std::chrono::milliseconds:"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1KvCacheRetentionConfig_1a169602b126b3210e4e48031c319d2a10"></span><span class="k"><span class="pre">explicit</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">KvCacheRetentionConfig</span></span></span><span class="sig-paren">(</span>
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE19KvCacheTransferModeNSt8optionalINSt6stringEEE">
+<span id="_CPPv3N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE19KvCacheTransferModeNSt8optionalINSt6stringEEE"></span><span id="_CPPv2N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE19KvCacheTransferModeNSt8optionalINSt6stringEEE"></span><span id="tensorrt_llm::executor::KvCacheRetentionConfig::KvCacheRetentionConfig__std::vector:TokenRangeRetentionConfig:CR.RetentionPriority.std::optional:std::chrono::milliseconds:.KvCacheTransferMode.std::optional:ss:"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1KvCacheRetentionConfig_1ace9a340c9336c06ad0bec7959aff3662"></span><span class="k"><span class="pre">explicit</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">KvCacheRetentionConfig</span></span></span><span class="sig-paren">(</span>
 
 <dl>
 <dd><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">vector</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfigE" title="tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig"><span class="n"><span class="pre">TokenRangeRetentionConfig</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">tokenRangeRetentionPriorities</span></span></em>,</dd>
 <dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor17RetentionPriorityE" title="tensorrt_llm::executor::RetentionPriority"><span class="n"><span class="pre">RetentionPriority</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">decodeRetentionPriority</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25kDefaultRetentionPriorityE" title="tensorrt_llm::executor::KvCacheRetentionConfig::kDefaultRetentionPriority"><span class="n"><span class="pre">kDefaultRetentionPriority</span></span></a></em>,</dd>
 <dd><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">optional</span></span><span class="p"><span class="pre">&lt;</span></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">chrono</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">milliseconds</span></span><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">decodeDurationMs</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">nullopt</span></span></em>,</dd>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor19KvCacheTransferModeE" title="tensorrt_llm::executor::KvCacheTransferMode"><span class="n"><span class="pre">KvCacheTransferMode</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">transferMode</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor19KvCacheTransferModeE" title="tensorrt_llm::executor::KvCacheTransferMode"><span class="n"><span class="pre">KvCacheTransferMode</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode4DRAME" title="tensorrt_llm::executor::KvCacheTransferMode::DRAM"><span class="n"><span class="pre">DRAM</span></span></a></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">optional</span></span><span class="p"><span class="pre">&lt;</span></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">directory</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">nullopt</span></span></em>,</dd>
 </dl>
 
-<span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE" title="Link to this definition">#</a><br /></dt>
+<span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE19KvCacheTransferModeNSt8optionalINSt6stringEEE" title="Link to this definition">#</a><br /></dt>
 <dd></dd></dl>
 
 <dl class="cpp function">
@@ -6946,6 +7632,16 @@
 <span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig19getDecodeDurationMsEv" title="Link to this definition">#</a><br /></dt>
 <dd></dd></dl>
 
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig15getTransferModeEv">
+<span id="_CPPv3NK12tensorrt_llm8executor22KvCacheRetentionConfig15getTransferModeEv"></span><span id="_CPPv2NK12tensorrt_llm8executor22KvCacheRetentionConfig15getTransferModeEv"></span><span id="tensorrt_llm::executor::KvCacheRetentionConfig::getTransferModeC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1KvCacheRetentionConfig_1aa5c8153595583b1ab1953281321c3c4a"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor19KvCacheTransferModeE" title="tensorrt_llm::executor::KvCacheTransferMode"><span class="n"><span class="pre">KvCacheTransferMode</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getTransferMode</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig15getTransferModeEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig12getDirectoryEv">
+<span id="_CPPv3NK12tensorrt_llm8executor22KvCacheRetentionConfig12getDirectoryEv"></span><span id="_CPPv2NK12tensorrt_llm8executor22KvCacheRetentionConfig12getDirectoryEv"></span><span id="tensorrt_llm::executor::KvCacheRetentionConfig::getDirectoryC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1KvCacheRetentionConfig_1ad113a79ac9102d1b82fbb8e8f46acb33"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">optional</span></span><span class="p"><span class="pre">&lt;</span></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getDirectory</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig12getDirectoryEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
 <dl class="cpp function">
 <dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig36getPerBlockRetentionPriorityDurationE10SizeType3210SizeType32">
 <span id="_CPPv3NK12tensorrt_llm8executor22KvCacheRetentionConfig36getPerBlockRetentionPriorityDurationE10SizeType3210SizeType32"></span><span id="_CPPv2NK12tensorrt_llm8executor22KvCacheRetentionConfig36getPerBlockRetentionPriorityDurationE10SizeType3210SizeType32"></span><span id="tensorrt_llm::executor::KvCacheRetentionConfig::getPerBlockRetentionPriorityDuration__SizeType32.SizeType32C"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1KvCacheRetentionConfig_1afd4e5ccd65e62f5d6bdba3bef0b116f0"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">vector</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDurationE" title="tensorrt_llm::executor::RetentionPriorityAndDuration"><span class="n"><span class="pre">RetentionPriorityAndDuration</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getPerBlockRetentionPriorityDuration</span></span></span><span class="sig-paren">(</span>
@@ -7003,6 +7699,18 @@
 <dd><p>The duration in ms that decode blocks should remain at their assigned priority level. </p>
 </dd></dl>
 
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig13mTransferModeE">
+<span id="_CPPv3N12tensorrt_llm8executor22KvCacheRetentionConfig13mTransferModeE"></span><span id="_CPPv2N12tensorrt_llm8executor22KvCacheRetentionConfig13mTransferModeE"></span><span id="tensorrt_llm::executor::KvCacheRetentionConfig::mTransferMode__KvCacheTransferMode"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1KvCacheRetentionConfig_1a5ca662666b3272cd793e25712b36c3cf"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor19KvCacheTransferModeE" title="tensorrt_llm::executor::KvCacheTransferMode"><span class="n"><span class="pre">KvCacheTransferMode</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mTransferMode</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig13mTransferModeE" title="Link to this definition">#</a><br /></dt>
+<dd><p>The transfer mode for the block. </p>
+</dd></dl>
+
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig10mDirectoryE">
+<span id="_CPPv3N12tensorrt_llm8executor22KvCacheRetentionConfig10mDirectoryE"></span><span id="_CPPv2N12tensorrt_llm8executor22KvCacheRetentionConfig10mDirectoryE"></span><span id="tensorrt_llm::executor::KvCacheRetentionConfig::mDirectory__std::optional:ss:"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1KvCacheRetentionConfig_1a444fbbc0fd47f6fb958346c1723e7fc6"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">optional</span></span><span class="p"><span class="pre">&lt;</span></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mDirectory</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig10mDirectoryE" title="Link to this definition">#</a><br /></dt>
+<dd><p>Name of the directory if transfer mode is GDS or POSIX_DEBUG_FALLBACK. </p>
+</dd></dl>
+
 </div>
 <dl class="cpp struct">
 <dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfigE">
@@ -9796,7 +10504,48 @@
 <dl class="cpp type">
 <dt class="sig sig-object cpp">
 <span class="target" id="namespacetensorrt__llm_1_1executor_1_1kv__cache"></span><span class="k"><span class="pre">namespace</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">kv_cache</span></span></span><br /></dt>
-<dd><dl class="cpp class">
+<dd><dl class="cpp struct">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10AgentStateE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10AgentStateE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10AgentStateE"></span><span id="tensorrt_llm::executor::kv_cache::AgentState"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1kv__cache_1_1AgentState"></span><span class="k"><span class="pre">struct</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">AgentState</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentStateE" title="Link to this definition">#</a><br /></dt>
+<dd><div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-public-functions">Public Functions</p>
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10AgentStateENSt6stringENSt6stringE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10AgentState10AgentStateENSt6stringENSt6stringE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10AgentState10AgentStateENSt6stringENSt6stringE"></span><span id="tensorrt_llm::executor::kv_cache::AgentState::AgentState__ss.ss"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1kv__cache_1_1AgentState_1a3811da59ccda26510ef568538f23ad8f"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">AgentState</span></span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">agentName</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">connectionInfo</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10AgentStateENSt6stringENSt6stringE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10AgentStateEv">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10AgentState10AgentStateEv"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10AgentState10AgentStateEv"></span><span id="tensorrt_llm::executor::kv_cache::AgentState::AgentState"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1kv__cache_1_1AgentState_1a93c96b1f59e2493d5f52dbcd9943ca0e"></span><span class="sig-name descname"><span class="n"><span class="pre">AgentState</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="k"><span class="pre">default</span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10AgentStateEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache10AgentStateeqERK10AgentState">
+<span id="_CPPv3NK12tensorrt_llm8executor8kv_cache10AgentStateeqERK10AgentState"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache10AgentStateeqERK10AgentState"></span><span id="tensorrt_llm::executor::kv_cache::AgentState::eq-operator__AgentStateCRC"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1kv__cache_1_1AgentState_1ae4b36a422b23fa4c630a29ec3cf21896"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="kt"><span class="pre">bool</span></span><span class="w"> </span><span class="sig-name descname"><span class="k"><span class="pre">operator</span></span><span class="o"><span class="pre">==</span></span></span><span class="sig-paren">(</span><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentStateE" title="tensorrt_llm::executor::kv_cache::AgentState"><span class="n"><span class="pre">AgentState</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="n sig-param"><span class="pre">other</span></span></em><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="k"><span class="pre">noexcept</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache10AgentStateeqERK10AgentState" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache10AgentState8toStringEv">
+<span id="_CPPv3NK12tensorrt_llm8executor8kv_cache10AgentState8toStringEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache10AgentState8toStringEv"></span><span id="tensorrt_llm::executor::kv_cache::AgentState::toStringC"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1kv__cache_1_1AgentState_1a67247a110909a7d635c6e66e12692b1f"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">toString</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache10AgentState8toStringEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+<div class="breathe-sectiondef docutils container">
+<p class="breathe-sectiondef-title rubric" id="breathe-section-title-public-members">Public Members</p>
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10mAgentNameE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10AgentState10mAgentNameE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10AgentState10mAgentNameE"></span><span id="tensorrt_llm::executor::kv_cache::AgentState::mAgentName__ss"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1kv__cache_1_1AgentState_1a6c452cc2a53a6f569ac6faf6dd427e86"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mAgentName</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10mAgentNameE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState15mConnectionInfoE">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache10AgentState15mConnectionInfoE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10AgentState15mConnectionInfoE"></span><span id="tensorrt_llm::executor::kv_cache::AgentState::mConnectionInfo__ss"></span><span class="target" id="structtensorrt__llm_1_1executor_1_1kv__cache_1_1AgentState_1a50683a09065e14e0a198337d9d8c6a79"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mConnectionInfo</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState15mConnectionInfoE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
+</div>
+</dd></dl>
+
+<dl class="cpp class">
 <dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache10CacheStateE">
 <span id="_CPPv3N12tensorrt_llm8executor8kv_cache10CacheStateE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache10CacheStateE"></span><span id="tensorrt_llm::executor::kv_cache::CacheState"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1CacheState"></span><span class="k"><span class="pre">class</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">CacheState</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10CacheStateE" title="Link to this definition">#</a><br /></dt>
 <dd><div class="breathe-sectiondef docutils container">
@@ -10085,6 +10834,18 @@
 <span id="_CPPv3N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt8uint16_tENSt6stringE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt8uint16_tENSt6stringE"></span><span id="tensorrt_llm::executor::kv_cache::CommState::CommState__std::uint16_t.ss"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1CommState_1a1d0d0238380c8e5fac3aa86ba42042b2"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">CommState</span></span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">uint16_t</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">port</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">string</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">ip</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt8uint16_tENSt6stringE" title="Link to this definition">#</a><br /></dt>
 <dd></dd></dl>
 
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10AgentStateEEi">
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10AgentStateEEi"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10AgentStateEEi"></span><span id="tensorrt_llm::executor::kv_cache::CommState::CommState__std::vector:AgentState:.i"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1CommState_1ae00620154dcc41776f8c9f32f071c86b"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="k"><span class="pre">explicit</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">CommState</span></span></span><span class="sig-paren">(</span>
+
+<dl>
+<dd><em class="sig-param"><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">vector</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentStateE" title="tensorrt_llm::executor::kv_cache::AgentState"><span class="n"><span class="pre">AgentState</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">agentState</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="kt"><span class="pre">int</span></span><span class="w"> </span><span class="n sig-param"><span class="pre">selfIdx</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="o"><span class="pre">-</span></span><span class="m"><span class="pre">1</span></span></em>,</dd>
+</dl>
+
+<span class="sig-paren">)</span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10AgentStateEEi" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
 <dl class="cpp function">
 <dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState10isMpiStateEv">
 <span id="_CPPv3NK12tensorrt_llm8executor8kv_cache9CommState10isMpiStateEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache9CommState10isMpiStateEv"></span><span id="tensorrt_llm::executor::kv_cache::CommState::isMpiStateC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1CommState_1adf13114c0a7a8e9b4152b930a320575a"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="kt"><span class="pre">bool</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">isMpiState</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="k"><span class="pre">noexcept</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState10isMpiStateEv" title="Link to this definition">#</a><br /></dt>
@@ -10095,6 +10856,11 @@
 <span id="_CPPv3NK12tensorrt_llm8executor8kv_cache9CommState13isSocketStateEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache9CommState13isSocketStateEv"></span><span id="tensorrt_llm::executor::kv_cache::CommState::isSocketStateC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1CommState_1aa0b16fbb6bbea11cb489205c1b096293"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="kt"><span class="pre">bool</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">isSocketState</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="k"><span class="pre">noexcept</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState13isSocketStateEv" title="Link to this definition">#</a><br /></dt>
 <dd></dd></dl>
 
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState12isAgentStateEv">
+<span id="_CPPv3NK12tensorrt_llm8executor8kv_cache9CommState12isAgentStateEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache9CommState12isAgentStateEv"></span><span id="tensorrt_llm::executor::kv_cache::CommState::isAgentStateC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1CommState_1a9642d2551eac95665ca271c81d5369c3"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="kt"><span class="pre">bool</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">isAgentState</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="k"><span class="pre">noexcept</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState12isAgentStateEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
 <dl class="cpp function">
 <dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState11getMpiStateEv">
 <span id="_CPPv3NK12tensorrt_llm8executor8kv_cache9CommState11getMpiStateEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache9CommState11getMpiStateEv"></span><span id="tensorrt_llm::executor::kv_cache::CommState::getMpiStateC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1CommState_1a227b9b9ab50d2c3dfde628f0fe038f32"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache8MpiStateE" title="tensorrt_llm::executor::kv_cache::MpiState"><span class="n"><span class="pre">MpiState</span></span></a><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="sig-name descname"><span class="n"><span class="pre">getMpiState</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState11getMpiStateEv" title="Link to this definition">#</a><br /></dt>
@@ -10105,6 +10871,11 @@
 <span id="_CPPv3NK12tensorrt_llm8executor8kv_cache9CommState14getSocketStateEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache9CommState14getSocketStateEv"></span><span id="tensorrt_llm::executor::kv_cache::CommState::getSocketStateC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1CommState_1a36da005c9ce6ede8d38861a265dabc97"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">vector</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11SocketStateE" title="tensorrt_llm::executor::kv_cache::SocketState"><span class="n"><span class="pre">SocketState</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="sig-name descname"><span class="n"><span class="pre">getSocketState</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState14getSocketStateEv" title="Link to this definition">#</a><br /></dt>
 <dd></dd></dl>
 
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState13getAgentStateEv">
+<span id="_CPPv3NK12tensorrt_llm8executor8kv_cache9CommState13getAgentStateEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache9CommState13getAgentStateEv"></span><span id="tensorrt_llm::executor::kv_cache::CommState::getAgentStateC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1CommState_1af0d5eaf4f688a59cf1e1a443fc1cf4d6"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">vector</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentStateE" title="tensorrt_llm::executor::kv_cache::AgentState"><span class="n"><span class="pre">AgentState</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="p"><span class="pre">&amp;</span></span><span class="sig-name descname"><span class="n"><span class="pre">getAgentState</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState13getAgentStateEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
 <dl class="cpp function">
 <dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState10getSelfIdxEv">
 <span id="_CPPv3NK12tensorrt_llm8executor8kv_cache9CommState10getSelfIdxEv"></span><span id="_CPPv2NK12tensorrt_llm8executor8kv_cache9CommState10getSelfIdxEv"></span><span id="tensorrt_llm::executor::kv_cache::CommState::getSelfIdxC"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1CommState_1a7b7d55568bced2fe9449f9ea5320cdc0"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><span class="kt"><span class="pre">int</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getSelfIdx</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><span class="w"> </span><span class="k"><span class="pre">noexcept</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState10getSelfIdxEv" title="Link to this definition">#</a><br /></dt>
@@ -10125,7 +10896,7 @@
 <p class="breathe-sectiondef-title rubric" id="breathe-section-title-private-members">Private Members</p>
 <dl class="cpp var">
 <dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm8executor8kv_cache9CommState6mStateE">
-<span id="_CPPv3N12tensorrt_llm8executor8kv_cache9CommState6mStateE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache9CommState6mStateE"></span><span id="tensorrt_llm::executor::kv_cache::CommState::mState__std::variant:std::monostate.MpiState.std::vector:SocketState::"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1CommState_1a2f6cfa08e438ed1763ff62d9f9e64c98"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">variant</span></span><span class="p"><span class="pre">&lt;</span></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">monostate</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache8MpiStateE" title="tensorrt_llm::executor::kv_cache::MpiState"><span class="n"><span class="pre">MpiState</span></span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">vector</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11SocketStateE" title="tensorrt_llm::executor::kv_cache::SocketState"><span class="n"><span class="pre">SocketState</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mState</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache9CommState6mStateE" title="Link to this definition">#</a><br /></dt>
+<span id="_CPPv3N12tensorrt_llm8executor8kv_cache9CommState6mStateE"></span><span id="_CPPv2N12tensorrt_llm8executor8kv_cache9CommState6mStateE"></span><span id="tensorrt_llm::executor::kv_cache::CommState::mState__std::variant:std::monostate.MpiState.std::vector:SocketState:.std::vector:AgentState::"></span><span class="target" id="classtensorrt__llm_1_1executor_1_1kv__cache_1_1CommState_1a7e9796e81b194b769e04845efdcf2516"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">variant</span></span><span class="p"><span class="pre">&lt;</span></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">monostate</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache8MpiStateE" title="tensorrt_llm::executor::kv_cache::MpiState"><span class="n"><span class="pre">MpiState</span></span></a><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">vector</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11SocketStateE" title="tensorrt_llm::executor::kv_cache::SocketState"><span class="n"><span class="pre">SocketState</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">vector</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentStateE" title="tensorrt_llm::executor::kv_cache::AgentState"><span class="n"><span class="pre">AgentState</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mState</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm8executor8kv_cache9CommState6mStateE" title="Link to this definition">#</a><br /></dt>
 <dd></dd></dl>
 
 <dl class="cpp var">
@@ -10215,11 +10986,6 @@
 <span class="target" id="namespacetensorrt__llm"></span><span class="k"><span class="pre">namespace</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">tensorrt_llm</span></span></span><br /></dt>
 <dd><dl class="cpp type">
 <dt class="sig sig-object cpp">
-<span class="target" id="namespacetensorrt__llm_1_1batch__manager"></span><span class="k"><span class="pre">namespace</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">batch_manager</span></span></span><br /></dt>
-<dd></dd></dl>
-
-<dl class="cpp type">
-<dt class="sig sig-object cpp">
 <span class="target" id="namespacetensorrt__llm_1_1executor"></span><span class="k"><span class="pre">namespace</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">executor</span></span></span><br /></dt>
 <dd><dl class="cpp type">
 <dt class="sig sig-object cpp">
@@ -10496,6 +11262,112 @@
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtimeE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::runtime</span></code></a></li>
 </ul>
 </li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#transferagent-h">transferAgent.h</a><ul class="nav section-nav flex-column">
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cacheE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::kv_cache</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache13TransferDescsE"><code class="docutils literal notranslate"><span class="pre">TransferDescs</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache13RegisterDescsE"><code class="docutils literal notranslate"><span class="pre">RegisterDescs</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11SyncMessageE"><code class="docutils literal notranslate"><span class="pre">SyncMessage</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache18ConnectionInfoTypeE"><code class="docutils literal notranslate"><span class="pre">ConnectionInfoType</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryTypeE"><code class="docutils literal notranslate"><span class="pre">MemoryType</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType5kDRAME"><code class="docutils literal notranslate"><span class="pre">kDRAM</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType5kVRAME"><code class="docutils literal notranslate"><span class="pre">kVRAM</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType4kBLKE"><code class="docutils literal notranslate"><span class="pre">kBLK</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType4kOBJE"><code class="docutils literal notranslate"><span class="pre">kOBJ</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType5kFILEE"><code class="docutils literal notranslate"><span class="pre">kFILE</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOpE"><code class="docutils literal notranslate"><span class="pre">TransferOp</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOp5kREADE"><code class="docutils literal notranslate"><span class="pre">kREAD</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOp6kWRITEE"><code class="docutils literal notranslate"><span class="pre">kWRITE</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4IDpEN12tensorrt_llm8executor8kv_cache17makeTransferAgentENSt10unique_ptrI17BaseTransferAgentEERKNSt6stringEDpRR4Args"><code class="docutils literal notranslate"><span class="pre">makeTransferAgent()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDescE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::kv_cache::AgentDesc</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDesc9AgentDescENSt6stringE"><code class="docutils literal notranslate"><span class="pre">AgentDesc()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache9AgentDesc19getBackendAgentDescEv"><code class="docutils literal notranslate"><span class="pre">getBackendAgentDesc()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDesc17mBackendAgentDescE"><code class="docutils literal notranslate"><span class="pre">mBackendAgentDesc</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15BaseAgentConfigE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::kv_cache::BaseAgentConfig</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15BaseAgentConfig5mNameE"><code class="docutils literal notranslate"><span class="pre">mName</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15BaseAgentConfig13useProgThreadE"><code class="docutils literal notranslate"><span class="pre">useProgThread</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgentE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::kv_cache::BaseTransferAgent</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgentD0Ev"><code class="docutils literal notranslate"><span class="pre">~BaseTransferAgent()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent14registerMemoryERK13RegisterDescs"><code class="docutils literal notranslate"><span class="pre">registerMemory()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16deregisterMemoryERK13RegisterDescs"><code class="docutils literal notranslate"><span class="pre">deregisterMemory()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent15loadRemoteAgentERKNSt6stringERK9AgentDesc"><code class="docutils literal notranslate"><span class="pre">loadRemoteAgent()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17getLocalAgentDescEv"><code class="docutils literal notranslate"><span class="pre">getLocalAgentDesc()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent21invalidateRemoteAgentERKNSt6stringE"><code class="docutils literal notranslate"><span class="pre">invalidateRemoteAgent()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent22submitTransferRequestsERK15TransferRequest"><code class="docutils literal notranslate"><span class="pre">submitTransferRequests()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17notifySyncMessageERKNSt6stringERK11SyncMessage"><code class="docutils literal notranslate"><span class="pre">notifySyncMessage()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent23getNotifiedSyncMessagesEv"><code class="docutils literal notranslate"><span class="pre">getNotifiedSyncMessages()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17getConnectionInfoEv"><code class="docutils literal notranslate"><span class="pre">getConnectionInfo()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent18connectRemoteAgentERKNSt6stringERK18ConnectionInfoType"><code class="docutils literal notranslate"><span class="pre">connectRemoteAgent()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16checkRemoteDescsERKNSt6stringERK11MemoryDescs"><code class="docutils literal notranslate"><span class="pre">checkRemoteDescs()</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::kv_cache::DynLibLoader</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9getHandleERKNSt6stringE"><code class="docutils literal notranslate"><span class="pre">getHandle()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4I0EN12tensorrt_llm8executor8kv_cache12DynLibLoader18getFunctionPointerE9FunctionTRKNSt6stringERKNSt6stringE"><code class="docutils literal notranslate"><span class="pre">getFunctionPointer()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderD0Ev"><code class="docutils literal notranslate"><span class="pre">~DynLibLoader()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader12DynLibLoaderEv"><code class="docutils literal notranslate"><span class="pre">DynLibLoader()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader12DynLibLoaderERK12DynLibLoader"><code class="docutils literal notranslate"><span class="pre">DynLibLoader()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderaSERK12DynLibLoader"><code class="docutils literal notranslate"><span class="pre">operator=()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader11getInstanceEv"><code class="docutils literal notranslate"><span class="pre">getInstance()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9mDllMutexE"><code class="docutils literal notranslate"><span class="pre">mDllMutex</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9mHandlersE"><code class="docutils literal notranslate"><span class="pre">mHandlers</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader5dlSymEPvPKc"><code class="docutils literal notranslate"><span class="pre">dlSym()</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDescE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::kv_cache::MemoryDesc</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescERKNSt6vectorIcEE8uint32_t"><code class="docutils literal notranslate"><span class="pre">MemoryDesc()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescEPv6size_t8uint32_t"><code class="docutils literal notranslate"><span class="pre">MemoryDesc()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescE9uintptr_t6size_t8uint32_t"><code class="docutils literal notranslate"><span class="pre">MemoryDesc()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache10MemoryDesc7getAddrEv"><code class="docutils literal notranslate"><span class="pre">getAddr()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache10MemoryDesc6getLenEv"><code class="docutils literal notranslate"><span class="pre">getLen()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache10MemoryDesc11getDeviceIdEv"><code class="docutils literal notranslate"><span class="pre">getDeviceId()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc9serializeERK10MemoryDescRNSt7ostreamE"><code class="docutils literal notranslate"><span class="pre">serialize()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc11deserializeERNSt7istreamE"><code class="docutils literal notranslate"><span class="pre">deserialize()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc14serializedSizeERK10MemoryDesc"><code class="docutils literal notranslate"><span class="pre">serializedSize()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc5mAddrE"><code class="docutils literal notranslate"><span class="pre">mAddr</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc4mLenE"><code class="docutils literal notranslate"><span class="pre">mLen</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc9mDeviceIdE"><code class="docutils literal notranslate"><span class="pre">mDeviceId</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescsE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::kv_cache::MemoryDescs</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs11MemoryDescsE10MemoryTypeNSt6vectorI10MemoryDescEE"><code class="docutils literal notranslate"><span class="pre">MemoryDescs()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache11MemoryDescs7getTypeEv"><code class="docutils literal notranslate"><span class="pre">getType()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache11MemoryDescs8getDescsEv"><code class="docutils literal notranslate"><span class="pre">getDescs()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs5mTypeE"><code class="docutils literal notranslate"><span class="pre">mType</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs6mDescsE"><code class="docutils literal notranslate"><span class="pre">mDescs</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequestE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::kv_cache::TransferRequest</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest15TransferRequestE10TransferOp13TransferDescs13TransferDescsRKNSt6stringENSt8optionalI11SyncMessageEE"><code class="docutils literal notranslate"><span class="pre">TransferRequest()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest5getOpEv"><code class="docutils literal notranslate"><span class="pre">getOp()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest11getSrcDescsEv"><code class="docutils literal notranslate"><span class="pre">getSrcDescs()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest11getDstDescsEv"><code class="docutils literal notranslate"><span class="pre">getDstDescs()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest13getRemoteNameEv"><code class="docutils literal notranslate"><span class="pre">getRemoteName()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest14getSyncMessageEv"><code class="docutils literal notranslate"><span class="pre">getSyncMessage()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest3mOpE"><code class="docutils literal notranslate"><span class="pre">mOp</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest9mSrcDescsE"><code class="docutils literal notranslate"><span class="pre">mSrcDescs</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest9mDstDescsE"><code class="docutils literal notranslate"><span class="pre">mDstDescs</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest11mRemoteNameE"><code class="docutils literal notranslate"><span class="pre">mRemoteName</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest12mSyncMessageE"><code class="docutils literal notranslate"><span class="pre">mSyncMessage</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache14TransferStatusE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::kv_cache::TransferStatus</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache14TransferStatusD0Ev"><code class="docutils literal notranslate"><span class="pre">~TransferStatus()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache14TransferStatus11isCompletedEv"><code class="docutils literal notranslate"><span class="pre">isCompleted()</span></code></a></li>
+<li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache14TransferStatus4waitEv"><code class="docutils literal notranslate"><span class="pre">wait()</span></code></a></li>
+</ul>
+</li>
+</ul>
+</li>
+</ul>
+</li>
 <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#serialization-h">serialization.h</a><ul class="nav section-nav flex-column">
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13SerializationE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::Serialization</span></code></a><ul class="nav section-nav flex-column">
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization20deserializeTimePointERNSt7istreamE"><code class="docutils literal notranslate"><span class="pre">deserializeTimePoint()</span></code></a></li>
@@ -10531,6 +11403,9 @@
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization22deserializeSocketStateERNSt7istreamE"><code class="docutils literal notranslate"><span class="pre">deserializeSocketState()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache11SocketStateERNSt7ostreamE"><code class="docutils literal notranslate"><span class="pre">serialize()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache11SocketStateE"><code class="docutils literal notranslate"><span class="pre">serializedSize()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization21deserializeAgentStateERNSt7istreamE"><code class="docutils literal notranslate"><span class="pre">deserializeAgentState()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10AgentStateERNSt7ostreamE"><code class="docutils literal notranslate"><span class="pre">serialize()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache10AgentStateE"><code class="docutils literal notranslate"><span class="pre">serializedSize()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization21deserializeCacheStateERNSt7istreamE"><code class="docutils literal notranslate"><span class="pre">deserializeCacheState()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10CacheStateERNSt7ostreamE"><code class="docutils literal notranslate"><span class="pre">serialize()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache10CacheStateE"><code class="docutils literal notranslate"><span class="pre">serializedSize()</span></code></a></li>
@@ -10628,6 +11503,9 @@
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization32deserializeInflightBatchingStatsERNSt7istreamE"><code class="docutils literal notranslate"><span class="pre">deserializeInflightBatchingStats()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21InflightBatchingStatsRNSt7ostreamE"><code class="docutils literal notranslate"><span class="pre">serialize()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK21InflightBatchingStats"><code class="docutils literal notranslate"><span class="pre">serializedSize()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization28deserializeSpecDecodingStatsERNSt7istreamE"><code class="docutils literal notranslate"><span class="pre">deserializeSpecDecodingStats()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK17SpecDecodingStatsRNSt7ostreamE"><code class="docutils literal notranslate"><span class="pre">serialize()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK17SpecDecodingStats"><code class="docutils literal notranslate"><span class="pre">serializedSize()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization25deserializeIterationStatsERNSt6vectorIcEE"><code class="docutils literal notranslate"><span class="pre">deserializeIterationStats()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization25deserializeIterationStatsERNSt7istreamE"><code class="docutils literal notranslate"><span class="pre">deserializeIterationStats()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStatsRNSt7ostreamE"><code class="docutils literal notranslate"><span class="pre">serialize()</span></code></a></li>
@@ -10656,12 +11534,12 @@
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor13Serialization20deserializeModelTypeERNSt7istreamE"><code class="docutils literal notranslate"><span class="pre">deserializeModelType()</span></code></a></li>
 </ul>
 </li>
-<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cacheE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::kv_cache</span></code></a></li>
 </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#types-h">types.h</a><ul class="nav section-nav flex-column">
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor9TensorPtrE"><code class="docutils literal notranslate"><span class="pre">TensorPtr</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor10SizeType32E"><code class="docutils literal notranslate"><span class="pre">SizeType32</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor10SizeType64E"><code class="docutils literal notranslate"><span class="pre">SizeType64</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor9FloatTypeE"><code class="docutils literal notranslate"><span class="pre">FloatType</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor11TokenIdTypeE"><code class="docutils literal notranslate"><span class="pre">TokenIdType</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor9VecTokensE"><code class="docutils literal notranslate"><span class="pre">VecTokens</span></code></a></li>
@@ -10756,6 +11634,12 @@
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor12FinishReason10kCANCELLEDE"><code class="docutils literal notranslate"><span class="pre">kCANCELLED</span></code></a></li>
 </ul>
 </li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor19KvCacheTransferModeE"><code class="docutils literal notranslate"><span class="pre">KvCacheTransferMode</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode4DRAME"><code class="docutils literal notranslate"><span class="pre">DRAM</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode3GDSE"><code class="docutils literal notranslate"><span class="pre">GDS</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode20POSIX_DEBUG_FALLBACKE"><code class="docutils literal notranslate"><span class="pre">POSIX_DEBUG_FALLBACK</span></code></a></li>
+</ul>
+</li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executorlsERNSt7ostreamE23CapacitySchedulerPolicy"><code class="docutils literal notranslate"><span class="pre">operator&lt;&lt;()</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executorlsERNSt7ostreamE21ContextChunkingPolicy"><code class="docutils literal notranslate"><span class="pre">operator&lt;&lt;()</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor24DebugTensorsPerIterationE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::DebugTensorsPerIteration</span></code></a><ul class="nav section-nav flex-column">
@@ -10891,6 +11775,7 @@
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor14IterationStats17crossKvCacheStatsE"><code class="docutils literal notranslate"><span class="pre">crossKvCacheStats</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor14IterationStats19staticBatchingStatsE"><code class="docutils literal notranslate"><span class="pre">staticBatchingStats</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor14IterationStats21inflightBatchingStatsE"><code class="docutils literal notranslate"><span class="pre">inflightBatchingStats</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor14IterationStats12specDecStatsE"><code class="docutils literal notranslate"><span class="pre">specDecStats</span></code></a></li>
 </ul>
 </li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor12KvCacheStatsE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::KvCacheStats</span></code></a><ul class="nav section-nav flex-column">
@@ -10960,6 +11845,15 @@
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor24RequestStatsPerIteration12requestStatsE"><code class="docutils literal notranslate"><span class="pre">requestStats</span></code></a></li>
 </ul>
 </li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor17SpecDecodingStatsE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::SpecDecodingStats</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor17SpecDecodingStats14numDraftTokensE"><code class="docutils literal notranslate"><span class="pre">numDraftTokens</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor17SpecDecodingStats17numAcceptedTokensE"><code class="docutils literal notranslate"><span class="pre">numAcceptedTokens</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor17SpecDecodingStats26numRequestsWithDraftTokensE"><code class="docutils literal notranslate"><span class="pre">numRequestsWithDraftTokens</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor17SpecDecodingStats16acceptanceLengthE"><code class="docutils literal notranslate"><span class="pre">acceptanceLength</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor17SpecDecodingStats13iterLatencyMSE"><code class="docutils literal notranslate"><span class="pre">iterLatencyMS</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor17SpecDecodingStats13draftOverheadE"><code class="docutils literal notranslate"><span class="pre">draftOverhead</span></code></a></li>
+</ul>
+</li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor19StaticBatchingStatsE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::StaticBatchingStats</span></code></a><ul class="nav section-nav flex-column">
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor19StaticBatchingStats20numScheduledRequestsE"><code class="docutils literal notranslate"><span class="pre">numScheduledRequests</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor19StaticBatchingStats18numContextRequestsE"><code class="docutils literal notranslate"><span class="pre">numContextRequests</span></code></a></li>
@@ -11396,10 +12290,12 @@
 </li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfigE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::KvCacheRetentionConfig</span></code></a><ul class="nav section-nav flex-column">
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigEv"><code class="docutils literal notranslate"><span class="pre">KvCacheRetentionConfig()</span></code></a></li>
-<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE"><code class="docutils literal notranslate"><span class="pre">KvCacheRetentionConfig()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE19KvCacheTransferModeNSt8optionalINSt6stringEEE"><code class="docutils literal notranslate"><span class="pre">KvCacheRetentionConfig()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig29getTokenRangeRetentionConfigsEv"><code class="docutils literal notranslate"><span class="pre">getTokenRangeRetentionConfigs()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig26getDecodeRetentionPriorityEv"><code class="docutils literal notranslate"><span class="pre">getDecodeRetentionPriority()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig19getDecodeDurationMsEv"><code class="docutils literal notranslate"><span class="pre">getDecodeDurationMs()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig15getTransferModeEv"><code class="docutils literal notranslate"><span class="pre">getTransferMode()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig12getDirectoryEv"><code class="docutils literal notranslate"><span class="pre">getDirectory()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig36getPerBlockRetentionPriorityDurationE10SizeType3210SizeType32"><code class="docutils literal notranslate"><span class="pre">getPerBlockRetentionPriorityDuration()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfigeqERK22KvCacheRetentionConfig"><code class="docutils literal notranslate"><span class="pre">operator==()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig21kMinRetentionPriorityE"><code class="docutils literal notranslate"><span class="pre">kMinRetentionPriority</span></code></a></li>
@@ -11408,6 +12304,8 @@
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig27mTokenRangeRetentionConfigsE"><code class="docutils literal notranslate"><span class="pre">mTokenRangeRetentionConfigs</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig24mDecodeRetentionPriorityE"><code class="docutils literal notranslate"><span class="pre">mDecodeRetentionPriority</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig17mDecodeDurationMsE"><code class="docutils literal notranslate"><span class="pre">mDecodeDurationMs</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig13mTransferModeE"><code class="docutils literal notranslate"><span class="pre">mTransferMode</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig10mDirectoryE"><code class="docutils literal notranslate"><span class="pre">mDirectory</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfigE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig</span></code></a><ul class="nav section-nav flex-column">
 <li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig25TokenRangeRetentionConfigE10SizeType32NSt8optionalI10SizeType32EE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE"><code class="docutils literal notranslate"><span class="pre">TokenRangeRetentionConfig()</span></code></a></li>
 <li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfigeqERK25TokenRangeRetentionConfig"><code class="docutils literal notranslate"><span class="pre">operator==()</span></code></a></li>
@@ -11824,6 +12722,15 @@
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor20DataTransceiverState10mCommStateE"><code class="docutils literal notranslate"><span class="pre">mCommState</span></code></a></li>
 </ul>
 </li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentStateE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::kv_cache::AgentState</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10AgentStateENSt6stringENSt6stringE"><code class="docutils literal notranslate"><span class="pre">AgentState()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10AgentStateEv"><code class="docutils literal notranslate"><span class="pre">AgentState()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache10AgentStateeqERK10AgentState"><code class="docutils literal notranslate"><span class="pre">operator==()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache10AgentState8toStringEv"><code class="docutils literal notranslate"><span class="pre">toString()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10mAgentNameE"><code class="docutils literal notranslate"><span class="pre">mAgentName</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState15mConnectionInfoE"><code class="docutils literal notranslate"><span class="pre">mConnectionInfo</span></code></a></li>
+</ul>
+</li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10CacheStateE"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm::executor::kv_cache::CacheState</span></code></a><ul class="nav section-nav flex-column">
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState13AttentionTypeE"><code class="docutils literal notranslate"><span class="pre">AttentionType</span></code></a><ul class="nav section-nav flex-column">
 <li class="toc-h5 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState13AttentionType8kDEFAULTE"><code class="docutils literal notranslate"><span class="pre">kDEFAULT</span></code></a></li>
@@ -11872,10 +12779,13 @@
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10SizeType32EEi"><code class="docutils literal notranslate"><span class="pre">CommState()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI11SocketStateEEi"><code class="docutils literal notranslate"><span class="pre">CommState()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt8uint16_tENSt6stringE"><code class="docutils literal notranslate"><span class="pre">CommState()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10AgentStateEEi"><code class="docutils literal notranslate"><span class="pre">CommState()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState10isMpiStateEv"><code class="docutils literal notranslate"><span class="pre">isMpiState()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState13isSocketStateEv"><code class="docutils literal notranslate"><span class="pre">isSocketState()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState12isAgentStateEv"><code class="docutils literal notranslate"><span class="pre">isAgentState()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState11getMpiStateEv"><code class="docutils literal notranslate"><span class="pre">getMpiState()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState14getSocketStateEv"><code class="docutils literal notranslate"><span class="pre">getSocketState()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState13getAgentStateEv"><code class="docutils literal notranslate"><span class="pre">getAgentState()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState10getSelfIdxEv"><code class="docutils literal notranslate"><span class="pre">getSelfIdx()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommStateeqERK9CommState"><code class="docutils literal notranslate"><span class="pre">operator==()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState8toStringEv"><code class="docutils literal notranslate"><span class="pre">toString()</span></code></a></li>
@@ -12012,6 +12922,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_cpp_gen/runtime.html b/latest/_cpp_gen/runtime.html
index f354db2294..a6ee809136 100644
--- a/latest/_cpp_gen/runtime.html
+++ b/latest/_cpp_gen/runtime.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -2190,6 +2194,18 @@
 <span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig16countLocalLayersE9LayerType10SizeType3210SizeType32" title="Link to this definition">#</a><br /></dt>
 <dd></dd></dl>
 
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getFirstLocalLayerE10SizeType3210SizeType32">
+<span id="_CPPv3NK12tensorrt_llm7runtime11ModelConfig18getFirstLocalLayerE10SizeType3210SizeType32"></span><span id="_CPPv2NK12tensorrt_llm7runtime11ModelConfig18getFirstLocalLayerE10SizeType3210SizeType32"></span><span id="tensorrt_llm::runtime::ModelConfig::getFirstLocalLayer__SizeType32.SizeType32C"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1ModelConfig_1a2c819f0d4717a6ad56c2f701f0ff1698"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getFirstLocalLayer</span></span></span><span class="sig-paren">(</span>
+
+<dl>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">pipelineParallelism</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">1</span></span></em>,</dd>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">pipelineParallelismRank</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span></em>,</dd>
+</dl>
+
+<span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getFirstLocalLayerE10SizeType3210SizeType32" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
 <dl class="cpp function">
 <dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm7runtime11ModelConfig20countLowerRankLayersE9LayerType10SizeType3210SizeType32">
 <span id="_CPPv3NK12tensorrt_llm7runtime11ModelConfig20countLowerRankLayersE9LayerType10SizeType3210SizeType32"></span><span id="_CPPv2NK12tensorrt_llm7runtime11ModelConfig20countLowerRankLayersE9LayerType10SizeType3210SizeType32"></span><span id="tensorrt_llm::runtime::ModelConfig::countLowerRankLayers__LayerType.SizeType32.SizeType32C"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1ModelConfig_1a4c9cabd1675a0db58bce743a0ac0470e"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">countLowerRankLayers</span></span></span><span class="sig-paren">(</span>
@@ -2204,8 +2220,15 @@
 <dd></dd></dl>
 
 <dl class="cpp function">
-<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType32">
-<span id="_CPPv3NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType32"></span><span id="_CPPv2NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType32"></span><span id="tensorrt_llm::runtime::ModelConfig::getNbLayers__SizeType32C"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1ModelConfig_1aefd69a08c1409f90a4e948d857cc08b1"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getNbLayers</span></span></span><span class="sig-paren">(</span><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">pipelineParallelism</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">1</span></span></em><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType32" title="Link to this definition">#</a><br /></dt>
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType3210SizeType32">
+<span id="_CPPv3NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType3210SizeType32"></span><span id="_CPPv2NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType3210SizeType32"></span><span id="tensorrt_llm::runtime::ModelConfig::getNbLayers__SizeType32.SizeType32C"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1ModelConfig_1aba756e17c1d83a61adf10f12a3787479"></span><span class="k"><span class="pre">inline</span></span><span class="w"> </span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getNbLayers</span></span></span><span class="sig-paren">(</span>
+
+<dl>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">pipelineParallelism</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">1</span></span></em>,</dd>
+<dd><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">pipelineParallelismRank</span></span><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="m"><span class="pre">0</span></span></em>,</dd>
+</dl>
+
+<span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType3210SizeType32" title="Link to this definition">#</a><br /></dt>
 <dd></dd></dl>
 
 <dl class="cpp function">
@@ -11199,6 +11222,19 @@ one more than decoding draft tokens for prediction from primary head </p>
 </dl>
 </dd></dl>
 
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsE10SizeType32">
+<span id="_CPPv3NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsE10SizeType32"></span><span id="_CPPv2NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsE10SizeType32"></span><span id="tensorrt_llm::runtime::decoder::DecoderState::getSequenceLengths__SizeType32C"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1decoder_1_1DecoderState_1ad9521ae6439b0704412f786c854c9145"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState9TensorPtrE" title="tensorrt_llm::runtime::decoder::DecoderState::TensorPtr"><span class="n"><span class="pre">TensorPtr</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getSequenceLengths</span></span></span><span class="sig-paren">(</span><em class="sig-param"><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="n sig-param"><span class="pre">batchIdx</span></span></em><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsE10SizeType32" title="Link to this definition">#</a><br /></dt>
+<dd><dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><p><strong>batchIdx</strong> – index of the batch </p>
+</dd>
+<dt class="field-even">Returns<span class="colon">:</span></dt>
+<dd class="field-even"><p>[maxBeamWidth], sequence lengths for request <code class="docutils literal notranslate"><span class="pre">batchIdx</span></code>, on gpu </p>
+</dd>
+</dl>
+</dd></dl>
+
 <dl class="cpp function">
 <dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getAllNewTokensEv">
 <span id="_CPPv3NK12tensorrt_llm7runtime7decoder12DecoderState15getAllNewTokensEv"></span><span id="_CPPv2NK12tensorrt_llm7runtime7decoder12DecoderState15getAllNewTokensEv"></span><span id="tensorrt_llm::runtime::decoder::DecoderState::getAllNewTokensC"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1decoder_1_1DecoderState_1a1313811f8c18a59d45a542374ee5f6df"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState9TensorPtrE" title="tensorrt_llm::runtime::decoder::DecoderState::TensorPtr"><span class="n"><span class="pre">TensorPtr</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getAllNewTokens</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getAllNewTokensEv" title="Link to this definition">#</a><br /></dt>
@@ -11270,6 +11306,11 @@ one more than decoding draft tokens for prediction from primary head </p>
 </dl>
 </dd></dl>
 
+<dl class="cpp function">
+<dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBatchSizeEv">
+<span id="_CPPv3NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBatchSizeEv"></span><span id="_CPPv2NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBatchSizeEv"></span><span id="tensorrt_llm::runtime::decoder::DecoderState::getMaxBatchSizeC"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1decoder_1_1DecoderState_1afa651d891bae6694a10aa7288c3724d9"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getMaxBatchSize</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBatchSizeEv" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
 <dl class="cpp function">
 <dt class="sig sig-object cpp" id="_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBeamWidthEv">
 <span id="_CPPv3NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBeamWidthEv"></span><span id="_CPPv2NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBeamWidthEv"></span><span id="tensorrt_llm::runtime::decoder::DecoderState::getMaxBeamWidthC"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1decoder_1_1DecoderState_1affb5c3e06a18f4e511a8f2662ed59013"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime10SizeType32E" title="tensorrt_llm::runtime::SizeType32"><span class="n"><span class="pre">SizeType32</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">getMaxBeamWidth</span></span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><span class="w"> </span><span class="k"><span class="pre">const</span></span><a class="headerlink" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBeamWidthEv" title="Link to this definition">#</a><br /></dt>
@@ -11500,6 +11541,11 @@ one more than decoding draft tokens for prediction from primary head </p>
 <span id="_CPPv3N12tensorrt_llm7runtime16AllReduceBuffers18mAllReduceCommPtrsE"></span><span id="_CPPv2N12tensorrt_llm7runtime16AllReduceBuffers18mAllReduceCommPtrsE"></span><span id="tensorrt_llm::runtime::AllReduceBuffers::mAllReduceCommPtrs__TensorPtr"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1AllReduceBuffers_1ab48e63279d11f42d71c3621820d2520c"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9TensorPtrE" title="tensorrt_llm::runtime::AllReduceBuffers::TensorPtr"><span class="n"><span class="pre">TensorPtr</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mAllReduceCommPtrs</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers18mAllReduceCommPtrsE" title="Link to this definition">#</a><br /></dt>
 <dd></dd></dl>
 
+<dl class="cpp var">
+<dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9mFlagPtrsE">
+<span id="_CPPv3N12tensorrt_llm7runtime16AllReduceBuffers9mFlagPtrsE"></span><span id="_CPPv2N12tensorrt_llm7runtime16AllReduceBuffers9mFlagPtrsE"></span><span id="tensorrt_llm::runtime::AllReduceBuffers::mFlagPtrs__TensorPtr"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1AllReduceBuffers_1a304f00427fcda4b28d5b235fef1a544c"></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9TensorPtrE" title="tensorrt_llm::runtime::AllReduceBuffers::TensorPtr"><span class="n"><span class="pre">TensorPtr</span></span></a><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mFlagPtrs</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9mFlagPtrsE" title="Link to this definition">#</a><br /></dt>
+<dd></dd></dl>
+
 <dl class="cpp var">
 <dt class="sig sig-object cpp" id="_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers17mIpcMemoryHandlesE">
 <span id="_CPPv3N12tensorrt_llm7runtime16AllReduceBuffers17mIpcMemoryHandlesE"></span><span id="_CPPv2N12tensorrt_llm7runtime16AllReduceBuffers17mIpcMemoryHandlesE"></span><span id="tensorrt_llm::runtime::AllReduceBuffers::mIpcMemoryHandles__std::vector:runtime::IpcMemory:"></span><span class="target" id="classtensorrt__llm_1_1runtime_1_1AllReduceBuffers_1a162c983f7dc981a8c4af57510637e767"></span><span class="n"><span class="pre">std</span></span><span class="p"><span class="pre">::</span></span><span class="n"><span class="pre">vector</span></span><span class="p"><span class="pre">&lt;</span></span><a class="reference internal" href="executor.html#_CPPv4N12tensorrt_llm7runtimeE" title="tensorrt_llm::runtime"><span class="n"><span class="pre">runtime</span></span></a><span class="p"><span class="pre">::</span></span><a class="reference internal" href="#_CPPv4N12tensorrt_llm7runtime9IpcMemoryE" title="tensorrt_llm::runtime::IpcMemory"><span class="n"><span class="pre">IpcMemory</span></span></a><span class="p"><span class="pre">&gt;</span></span><span class="w"> </span><span class="sig-name descname"><span class="n"><span class="pre">mIpcMemoryHandles</span></span></span><a class="headerlink" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers17mIpcMemoryHandlesE" title="Link to this definition">#</a><br /></dt>
@@ -12171,8 +12217,9 @@ one more than decoding draft tokens for prediction from primary head </p>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getVocabSizeEv"><code class="docutils literal notranslate"><span class="pre">getVocabSize()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getVocabSizePaddedE10SizeType32"><code class="docutils literal notranslate"><span class="pre">getVocabSizePadded()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig16countLocalLayersE9LayerType10SizeType3210SizeType32"><code class="docutils literal notranslate"><span class="pre">countLocalLayers()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getFirstLocalLayerE10SizeType3210SizeType32"><code class="docutils literal notranslate"><span class="pre">getFirstLocalLayer()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig20countLowerRankLayersE9LayerType10SizeType3210SizeType32"><code class="docutils literal notranslate"><span class="pre">countLowerRankLayers()</span></code></a></li>
-<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType32"><code class="docutils literal notranslate"><span class="pre">getNbLayers()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType3210SizeType32"><code class="docutils literal notranslate"><span class="pre">getNbLayers()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getNbAttentionLayersE10SizeType3210SizeType32"><code class="docutils literal notranslate"><span class="pre">getNbAttentionLayers()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getNbRnnLayersE10SizeType3210SizeType32"><code class="docutils literal notranslate"><span class="pre">getNbRnnLayers()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime11ModelConfig10getNbHeadsEv"><code class="docutils literal notranslate"><span class="pre">getNbHeads()</span></code></a></li>
@@ -13526,6 +13573,7 @@ one more than decoding draft tokens for prediction from primary head </p>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState11getLogProbsEv"><code class="docutils literal notranslate"><span class="pre">getLogProbs()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState11getLogProbsE10SizeType32"><code class="docutils literal notranslate"><span class="pre">getLogProbs()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsEv"><code class="docutils literal notranslate"><span class="pre">getSequenceLengths()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsE10SizeType32"><code class="docutils literal notranslate"><span class="pre">getSequenceLengths()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getAllNewTokensEv"><code class="docutils literal notranslate"><span class="pre">getAllNewTokens()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getNextDraftTokensEv"><code class="docutils literal notranslate"><span class="pre">getNextDraftTokens()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState25getPrevDraftTokensLengthsEv"><code class="docutils literal notranslate"><span class="pre">getPrevDraftTokensLengths()</span></code></a></li>
@@ -13533,6 +13581,7 @@ one more than decoding draft tokens for prediction from primary head </p>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState24getAcceptedLengthsCumSumEv"><code class="docutils literal notranslate"><span class="pre">getAcceptedLengthsCumSum()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState22getAcceptedPackedPathsEv"><code class="docutils literal notranslate"><span class="pre">getAcceptedPackedPaths()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState16getFinishedStepsEv"><code class="docutils literal notranslate"><span class="pre">getFinishedSteps()</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBatchSizeEv"><code class="docutils literal notranslate"><span class="pre">getMaxBatchSize()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBeamWidthEv"><code class="docutils literal notranslate"><span class="pre">getMaxBeamWidth()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState20getMaxSequenceLengthEv"><code class="docutils literal notranslate"><span class="pre">getMaxSequenceLength()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState27getMaxDecodingDecoderTokensEv"><code class="docutils literal notranslate"><span class="pre">getMaxDecodingDecoderTokens()</span></code></a></li>
@@ -13566,6 +13615,7 @@ one more than decoding draft tokens for prediction from primary head </p>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9TensorPtrE"><code class="docutils literal notranslate"><span class="pre">TensorPtr</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb"><code class="docutils literal notranslate"><span class="pre">AllReduceBuffers()</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers18mAllReduceCommPtrsE"><code class="docutils literal notranslate"><span class="pre">mAllReduceCommPtrs</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9mFlagPtrsE"><code class="docutils literal notranslate"><span class="pre">mFlagPtrs</span></code></a></li>
 <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers17mIpcMemoryHandlesE"><code class="docutils literal notranslate"><span class="pre">mIpcMemoryHandles</span></code></a></li>
 </ul>
 </li>
@@ -13717,6 +13767,15 @@ one more than decoding draft tokens for prediction from primary head </p>
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py b/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py
new file mode 100644
index 0000000000..32dcea9fff
--- /dev/null
+++ b/latest/_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py
@@ -0,0 +1,1081 @@
+import math
+import weakref
+from enum import IntEnum
+from typing import Optional, Union, cast
+
+import torch
+from torch import nn
+
+from tensorrt_llm.mapping import Mapping
+
+from ..attention_backend import (AttentionInputType, AttentionMetadata,
+                                 TrtllmAttention, TrtllmAttentionMetadata)
+from ..attention_backend.interface import (PositionalEmbeddingParams,
+                                           PredefinedAttentionMask)
+from ..attention_backend.utils import create_attention, get_attention_backend
+from ..distributed import AllReduceParams
+from ..model_config import ModelConfig
+from ..peft.lora.layer import LoraLayer, LoraModuleType
+from ..utils import Fp4QuantizedTensor, get_model_extra_attrs
+from .linear import Linear, TensorParallelMode, WeightMode, WeightsLoadingConfig
+from .multi_stream_utils import maybe_execute_in_parallel
+from .rms_norm import RMSNorm
+from .rotary_embedding import RotaryEmbedding
+
+
+class QkNormType(IntEnum):
+    """
+    The type of QK normalization.
+    """
+    none = 0  # No normalization applied to Q and K
+    pre_rope = 1  # Apply normalization before Rope
+    post_rope = 2  # Apply normalization after Rope
+
+
+class Attention(nn.Module):
+
+    def __init__(
+        self,
+        *,
+        hidden_size: int,
+        num_attention_heads: int,
+        num_key_value_heads: int,
+        max_position_embeddings: int,
+        bias: bool,
+        pos_embd_params: Optional[PositionalEmbeddingParams] = None,
+        qk_norm_type: QkNormType = QkNormType.none,
+        layer_idx: Optional[int] = None,
+        dtype: torch.dtype = None,
+        dense_bias: Optional[bool] = None,
+        config: Optional[ModelConfig] = None,
+        q_scaling: float = 1.0,
+        attention_chunk_size: Optional[int] = None,
+    ):
+        """
+        Initialize the Attention module.
+
+        Args:
+            hidden_size (int): The size of the hidden dimension.
+            num_attention_heads (int): The number of attention heads.
+            num_key_value_heads (int): The number of key value heads.
+            max_position_embeddings (int): The maximum position embeddings.
+            bias (bool): Whether to use bias in the linear layers.
+            pos_embd_params (PositionalEmbeddingParams): The positional embedding parameters.
+            qk_norm_type (QkNormType): The type of QK normalization.
+            layer_idx (int): The layer index.
+            dtype (torch.dtype): The data type.
+            dense_bias (bool): Whether to use bias in the output projection layer.
+            config (ModelConfig): The model configuration.
+            q_scaling (float): The scaling factor applied to qk_scale, where $O = softmax(QK^T * qk_scale) * V$ and $qk_scale = 1 / (sqrt(head_dim) * q_scaling)$. The default value is 1.0.
+            attention_chunk_size (int): See [Chunked Attention] below.
+        """
+        super().__init__()
+        self.layer_idx = layer_idx
+
+        config = config or ModelConfig()
+        self.hidden_size = hidden_size
+        self.num_heads = num_attention_heads
+        self.head_dim = getattr(config.pretrained_config, "head_dim",
+                                self.hidden_size // self.num_heads)
+        self.num_key_value_heads = num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = max_position_embeddings
+        self.pos_embd_params = pos_embd_params
+        self.qk_norm_type = qk_norm_type
+        self.dense_bias = dense_bias
+        self.q_scaling = q_scaling
+
+        # [Chunked Attention]
+        # Chunked attention is applied to context requests only. Chunked attention will be
+        # applied when this field is specified and mMaskType == CAUSAL.
+        #
+        # In chunked attention, we break context requests into chunks of a specified size. Tokens can only
+        # attend to tokens in the same chunk. So, for example, if the chunk size is 3, we might have a mask
+        # that looks like this:
+        #
+        # 1 0 0 0 0 0
+        # 1 1 0 0 0 0
+        # 1 1 1 0 0 0
+        # 0 0 0 1 0 0
+        # 0 0 0 1 1 0
+        # 0 0 0 1 1 1
+        self.attention_chunk_size = attention_chunk_size
+
+        if dense_bias is None:
+            self.dense_bias = bias
+
+        # tensor parallel
+        tp_size = config.mapping.tp_size
+        pp_size = config.mapping.pp_size
+        if config.mapping.enable_attention_dp:
+            tp_size = 1
+
+        mapping = Mapping(
+            world_size=tp_size * pp_size,
+            tp_size=tp_size,
+            pp_size=pp_size,
+            rank=config.mapping.rank,
+            gpus_per_node=config.mapping.gpus_per_node,
+            enable_attention_dp=config.mapping.enable_attention_dp,
+        )
+        assert self.num_heads % tp_size == 0
+        self.num_heads = self.num_heads // tp_size
+        self.num_key_value_heads = (self.num_key_value_heads + tp_size -
+                                    1) // tp_size
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_key_value_heads * self.head_dim
+
+        self.qkv_proj = Linear(
+            self.hidden_size,
+            tp_size * self.q_size + 2 * tp_size * self.kv_size,
+            bias=bias,
+            dtype=dtype,
+            mapping=mapping,
+            tensor_parallel_mode=TensorParallelMode.COLUMN,
+            weights_loading_config=WeightsLoadingConfig(
+                weight_mode=WeightMode.FUSED_QKV_LINEAR),
+            quant_config=config.get_quant_config(),
+            skip_create_weights_in_init=config.skip_create_weights_in_init,
+        )
+        self.o_lora = LoraLayer([LoraModuleType.ATTENTION_DENSE],
+                                [self.hidden_size])
+
+        self.o_proj = Linear(
+            tp_size * self.q_size,
+            self.hidden_size,
+            bias=self.dense_bias,
+            dtype=dtype,
+            mapping=mapping,
+            tensor_parallel_mode=TensorParallelMode.ROW,
+            quant_config=config.get_quant_config(),
+            skip_create_weights_in_init=config.skip_create_weights_in_init,
+            lora=self.o_lora,
+        )
+
+        self.quant_config = config.get_quant_config()
+        self.attn_backend = config.attn_backend
+        attn_cls = get_attention_backend(self.attn_backend)
+
+        # These two modules are mutually exclusive - either splitted_qkv_lora or fused_qkv_lora will be used,
+        # but never both at the same time. splitted_qkv_lora handles Q,K,V separately while fused_qkv_lora
+        # handles them as a single fused operation.
+        self.splitted_qkv_lora = LoraLayer([
+            LoraModuleType.ATTENTION_Q, LoraModuleType.ATTENTION_K,
+            LoraModuleType.ATTENTION_V
+        ], [self.q_size, self.kv_size, self.kv_size])
+        self.fused_qkv_lora = LoraLayer([LoraModuleType.ATTENTION_QKV],
+                                        [self.q_size + 2 * self.kv_size])
+
+        self.o_lora = LoraLayer([LoraModuleType.ATTENTION_DENSE],
+                                [self.hidden_size])
+
+        # enable_rope_fusion: Whether to fuse RoPE into the attention OP.
+        # If true, RoPE will be applied in self.attn.forward.
+        # If false, RoPE will be applied in self.apply_rope.
+        self.enable_rope_fusion = attn_cls.support_fused_rope(
+        ) and self.qk_norm_type != QkNormType.post_rope
+
+        self.rotary_emb = None
+        if not self.enable_rope_fusion and self.pos_embd_params is not None:
+            self.rotary_emb = RotaryEmbedding(
+                self.pos_embd_params.rope,
+                head_dim=self.head_dim,
+                is_neox=self.pos_embd_params.is_neox,
+            )
+
+        self.attn = create_attention(
+            self.attn_backend,
+            self.layer_idx,
+            self.num_heads,
+            self.head_dim,
+            self.num_key_value_heads,
+            pos_embd_params=self.pos_embd_params
+            if self.enable_rope_fusion else None,
+            quant_config=self.quant_config,
+            skip_create_weights_in_init=config.skip_create_weights_in_init,
+            q_scaling=self.q_scaling,
+            attention_chunk_size=self.attention_chunk_size,
+        )
+
+        self.support_fused_qkv = self.attn.support_fused_qkv()
+
+        if not config.skip_create_weights_in_init:
+            self.create_weights()
+
+    def create_weights(self):
+        # self.attn has no weights but has states that are related to quant_config,
+        # which could be modified after __init__
+        self.attn.update_quant_config(self.quant_config)
+
+    def split_qkv(self, q, k=None, v=None):
+        if k is None and v is None:
+            q, k, v = q.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        return q, k, v
+
+    def convert_qkv(self, q, k, v):
+        if k is None and v is None and not self.support_fused_qkv:
+            q, k, v = self.split_qkv(q)
+        elif k is not None and v is not None and self.support_fused_qkv:
+            qkv = torch.concat([q, k, v], dim=-1)
+            q, k, v = qkv, None, None
+        return q, k, v
+
+    def forward(
+        self,
+        position_ids: Optional[torch.LongTensor],
+        hidden_states: Union[torch.Tensor, Fp4QuantizedTensor],
+        attn_metadata: AttentionMetadata,
+        attention_mask: PredefinedAttentionMask = PredefinedAttentionMask.
+        CAUSAL,
+        mrope_config: Optional[dict] = None,
+        all_reduce_params: Optional[AllReduceParams] = None,
+        lora_params: Optional[dict] = None,
+        attention_window_size: Optional[int] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Forward pass for the Attention module.
+
+        Args:
+            position_ids (Optional[torch.LongTensor]): The position IDs.
+            hidden_states (Union[torch.Tensor, Fp4QuantizedTensor]): The hidden states.
+            attn_metadata (AttentionMetadata): The attention metadata.
+            attention_mask (PredefinedAttentionMask): The attention mask type.
+            mrope_config (Optional[dict]): The MROPE configuration.
+            all_reduce_params (Optional[AllReduceParams]): The all reduce parameters.
+            lora_params (Optional[dict]): The LoRA parameters.
+            attention_window_size (Optional[int]): The attention window size.
+
+        Returns:
+            torch.Tensor: The output tensor.
+        """
+        qkv = self.qkv_proj(hidden_states)
+
+        if bool(lora_params):
+            qkv_lora = self.splitted_qkv_lora(hidden_states, lora_params,
+                                              self.layer_idx)
+            if qkv_lora is not None:
+                qkv = qkv + qkv_lora
+
+            qkv_lora = self.fused_qkv_lora(hidden_states, lora_params,
+                                           self.layer_idx)
+            if qkv_lora is not None:
+                qkv = qkv + qkv_lora
+
+        q, k, v = self.apply_rope(qkv, position_ids)
+
+        out_scale = None
+        if self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4 or self.o_proj.has_fp8_block_scales:
+            out_scale = self.o_proj.inv_input_scale
+
+        q, k, v = self.convert_qkv(q, k, v)
+        attn_output = self.attn.forward(
+            q,
+            k,
+            v,
+            attn_metadata,
+            out_scale=out_scale,
+            attention_mask=attention_mask,
+            mrope_config=mrope_config,
+            attention_window_size=attention_window_size)
+        hidden_states = attn_output
+        attn_output = self.o_proj(attn_output,
+                                  all_reduce_params=all_reduce_params,
+                                  lora_params=lora_params,
+                                  layer_idx=self.layer_idx)
+        return attn_output
+
+    def apply_qk_norm(self, q, k):
+        raise NotImplementedError(
+            f"QK norm is not implemented for {self.__class__.__name__}."
+            "Please override the `apply_qk_norm` method in the subclass.")
+
+    def apply_rope(self, qkv: torch.Tensor, position_ids: torch.Tensor):
+        """
+        Apply RoPE to the query and key, possibly including QK norm.
+        Args:
+            qkv (torch.Tensor): The query, key, and value tensor.
+            position_ids (torch.Tensor): The position IDs of each token for RoPE.
+        Returns:
+            tuple: A tuple of (q, k, v).
+            This method may be overridden in a subclass. Depending on the implementation, k and v may be None, in which case q is the concatenated qkv tensor.
+            Before self.attn.forward is called, convert_qkv is called to make sure that the format of (q, k, v) satisfies the requirements of self.attn.
+        """
+        q, k, v = qkv, None, None
+        if self.qk_norm_type == QkNormType.pre_rope:
+            q, k, v = self.split_qkv(q, k, v)
+            q, k = self.apply_qk_norm(q, k)
+        if not self.enable_rope_fusion and position_ids is not None:
+            q, k, v = self.split_qkv(q, k, v)
+            q, k = self.rotary_emb(position_ids, [q, k])
+            if self.qk_norm_type == QkNormType.post_rope:
+                q, k = self.apply_qk_norm(q, k)
+
+        return q, k, v
+
+
+def extract_extra_attrs(layer_idx: str):
+    extra_attrs = get_model_extra_attrs()
+    assert extra_attrs is not None, "Model extra attrs is not set"
+
+    metadata_ref = extra_attrs.get("attention_metadata", None)
+    assert metadata_ref is not None, "Attention metadata is not set"
+    metadata = metadata_ref()
+    assert isinstance(
+        metadata,
+        TrtllmAttentionMetadata,
+    )
+
+    mla_layers = extra_attrs.get("mla_layers", None)
+    assert mla_layers is not None, "MLA layers is not registered"
+    mla_layer_ref = mla_layers.get(layer_idx, None)
+    assert mla_layer_ref is not None, f"Cannot find MLA layer for layer {layer_idx}"
+    mla_layer = mla_layer_ref()
+    assert isinstance(
+        mla_layer,
+        MLA), "MLA layer must be a subclass of MLA or an instance of MLA"
+
+    return metadata, mla_layer
+
+
+@torch.library.custom_op("trtllm::mla_custom_op", mutates_args=())
+def mla_custom_op(
+    position_ids: Optional[torch.Tensor],
+    hidden_states: torch.Tensor,
+    layer_idx: str,
+) -> torch.Tensor:
+    metadata, mla_layer = extract_extra_attrs(layer_idx)
+
+    return mla_layer.forward_impl(position_ids, hidden_states, metadata)
+
+
+@mla_custom_op.register_fake
+def _(position_ids, hidden_states, layer_idx):
+    _, mla_layer = extract_extra_attrs(layer_idx)
+    return mla_layer.forward_impl_fake(hidden_states)
+
+
+class MLA(nn.Module):
+
+    def __init__(
+        self,
+        *,
+        hidden_size: int,
+        num_attention_heads: int,
+        num_key_value_heads: int,
+        qk_nope_head_dim: int,
+        qk_rope_head_dim: int,
+        v_head_dim: int,
+        q_lora_rank: int,
+        kv_lora_rank: int,
+        predicted_tokens_per_seq: int,
+        max_position_embeddings: int,
+        bias: bool,
+        aux_stream: Optional[torch.cuda.Stream] = None,
+        pos_embd_params: Optional[PositionalEmbeddingParams] = None,
+        layer_idx: Optional[int] = None,
+        dtype: torch.dtype = None,
+        dense_bias: Optional[bool] = None,
+        config: Optional[ModelConfig] = None,
+    ):
+        """
+        Initialize the MLA module.
+
+        Args:
+            hidden_size (int): The size of the hidden dimension.
+            num_attention_heads (int): The number of attention heads.
+            num_key_value_heads (int): The number of key value heads.
+            qk_nope_head_dim (int): The head dimension of the query and key part without RoPE.
+            qk_rope_head_dim (int): The head dimension of the query and key part with RoPE; qk_head_dim is the sum of both parts.
+            v_head_dim (int): The dimension of the value.
+            q_lora_rank (int): The dimension of the compressed query. If None, hidden_size is used instead and is_lite is set to True.
+            kv_lora_rank (int): The dimension of the compressed key and value.
+            predicted_tokens_per_seq (int): The number of predicted tokens per sequence.
+            max_position_embeddings (int): The maximum position embeddings.
+            bias (bool): Whether to use bias in the linear layers.
+            aux_stream (Optional[torch.cuda.Stream]): The auxiliary CUDA stream for running operations in two parallel streams.
+            pos_embd_params (PositionalEmbeddingParams): The positional embedding parameters.
+            layer_idx (int): The layer index.
+            dtype (torch.dtype): The data type.
+            dense_bias (bool): Whether to use bias in the output projection layer.
+            config (ModelConfig): The model configuration.
+        """
+        super().__init__()
+        self.layer_idx = layer_idx
+        self.layer_idx_str = str(layer_idx)
+        self.dtype = dtype
+
+        self.hidden_size = hidden_size
+        self.num_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.predicted_tokens_per_seq = predicted_tokens_per_seq
+        self.max_position_embeddings = max_position_embeddings
+        self.pos_embd_params = pos_embd_params
+        self.dense_bias = dense_bias
+        if dense_bias is None:
+            self.dense_bias = bias
+
+        if self.q_lora_rank is None:
+            self.q_lora_rank = hidden_size
+            self.is_lite = True
+        else:
+            self.is_lite = False
+
+        assert pos_embd_params is not None, "pos_embd_params must be provided in MLA"
+
+        self.register_to_config = False
+        if config is not None:
+            if "mla_layers" not in config.extra_attrs:
+                config.extra_attrs["mla_layers"] = {}
+            config.extra_attrs["mla_layers"][self.layer_idx_str] = weakref.ref(
+                self)
+            self.register_to_config = True
+
+        # tensor parallel
+        config = config or ModelConfig()
+        tp_size = config.mapping.tp_size
+        pp_size = config.mapping.pp_size
+        if config.mapping.enable_attention_dp:
+            tp_size = 1
+
+        mapping = Mapping(
+            world_size=tp_size * pp_size,
+            tp_size=tp_size,
+            pp_size=pp_size,
+            rank=config.mapping.rank,
+            gpus_per_node=config.mapping.gpus_per_node,
+            enable_attention_dp=config.mapping.enable_attention_dp,
+        )
+
+        assert self.num_heads % tp_size == 0
+        self.num_heads = self.num_heads // tp_size
+        self.num_key_value_heads = (self.num_key_value_heads + tp_size -
+                                    1) // tp_size
+
+        rms_norm_eps = config.pretrained_config.rms_norm_eps
+        quant_config = config.get_quant_config()
+        self.quant_config = quant_config
+
+        if not self.is_lite:
+            self.fused_a = Linear(
+                hidden_size,
+                self.q_lora_rank + self.kv_lora_rank + self.qk_rope_head_dim,
+                bias=bias,
+                dtype=dtype,
+                quant_config=quant_config,
+                skip_create_weights_in_init=config.skip_create_weights_in_init,
+                use_custom_cublas_mm=True)
+
+            self.q_a_layernorm = RMSNorm(hidden_size=self.q_lora_rank,
+                                         eps=rms_norm_eps,
+                                         dtype=dtype)
+
+            self.q_b_proj = Linear(
+                self.q_lora_rank,
+                tp_size * self.num_heads * self.qk_head_dim,
+                bias=bias,
+                dtype=dtype,
+                mapping=mapping,
+                tensor_parallel_mode=TensorParallelMode.COLUMN,
+                quant_config=quant_config,
+                skip_create_weights_in_init=config.skip_create_weights_in_init)
+        else:
+            self.fused_a = Linear(
+                hidden_size,
+                self.kv_lora_rank + self.qk_rope_head_dim,
+                bias=bias,
+                dtype=dtype,
+                quant_config=quant_config,
+                skip_create_weights_in_init=config.skip_create_weights_in_init,
+                use_custom_cublas_mm=True)
+
+            self.q_proj = Linear(
+                self.q_lora_rank,
+                tp_size * self.num_heads * self.qk_head_dim,
+                bias=bias,
+                dtype=dtype,
+                mapping=mapping,
+                tensor_parallel_mode=TensorParallelMode.COLUMN,
+                quant_config=quant_config,
+                skip_create_weights_in_init=config.skip_create_weights_in_init,
+            )
+            self.q_b_proj = self.q_proj
+
+        self.kv_a_layernorm = RMSNorm(hidden_size=kv_lora_rank,
+                                      dtype=dtype,
+                                      eps=rms_norm_eps)
+
+        self.kv_b_proj = Linear(
+            self.kv_lora_rank,
+            tp_size * self.num_heads *
+            (self.qk_nope_head_dim + self.v_head_dim),
+            bias=bias,
+            dtype=dtype,
+            mapping=mapping,
+            tensor_parallel_mode=TensorParallelMode.COLUMN,
+            quant_config=quant_config,
+            skip_create_weights_in_init=config.skip_create_weights_in_init)
+        # This parameter will view into self.kv_b_proj.weight after loading weights.
+        # For dummy weight initialization, this parameter is initialized with empty tensor.
+        # Used in forward_generation only
+        self.v_b_proj = nn.Parameter(
+            torch.empty(
+                (self.num_heads, self.v_head_dim, self.kv_lora_rank),
+                dtype=dtype,
+            ),
+            requires_grad=False,
+        )
+
+        self.o_proj = Linear(
+            self.num_key_value_heads * self.v_head_dim * tp_size,
+            self.hidden_size,
+            bias=self.dense_bias,
+            dtype=dtype,
+            mapping=mapping,
+            tensor_parallel_mode=TensorParallelMode.ROW,
+            quant_config=quant_config,
+            skip_create_weights_in_init=config.skip_create_weights_in_init,
+        )
+
+        def yarn_get_mscale(scale=1, mscale=1):
+            if scale <= 1:
+                return 1.0
+            return 0.1 * mscale * math.log(scale) + 1.0
+
+        mscale_all_dim = pos_embd_params.rope.mscale_all_dim
+        scaling_factor = pos_embd_params.rope.scale
+        mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
+        q_scaling = 1.0 / (mscale * mscale)
+
+        self.mha = create_attention(
+            config.attn_backend,
+            self.layer_idx,
+            self.num_heads,
+            head_dim=self.qk_head_dim,
+            num_kv_heads=self.num_key_value_heads,
+            pos_embd_params=pos_embd_params,
+            quant_config=quant_config,
+            q_scaling=q_scaling,
+            is_mla_enable=True,
+            q_lora_rank=self.q_lora_rank,
+            kv_lora_rank=self.kv_lora_rank,
+            qk_nope_head_dim=self.qk_nope_head_dim,
+            qk_rope_head_dim=self.qk_rope_head_dim,
+            v_head_dim=self.v_head_dim,
+            predicted_tokens_per_seq=self.predicted_tokens_per_seq,
+            skip_create_weights_in_init=config.skip_create_weights_in_init,
+        )
+
+        self.mqa = create_attention(
+            config.attn_backend,
+            self.layer_idx,
+            self.num_heads,
+            head_dim=self.kv_lora_rank + self.qk_rope_head_dim,
+            num_kv_heads=1,
+            pos_embd_params=pos_embd_params,
+            quant_config=quant_config,
+            q_scaling=q_scaling,
+            is_mla_enable=True,
+            q_lora_rank=self.q_lora_rank,
+            kv_lora_rank=self.kv_lora_rank,
+            qk_nope_head_dim=self.qk_nope_head_dim,
+            qk_rope_head_dim=self.qk_rope_head_dim,
+            v_head_dim=self.kv_lora_rank,
+            predicted_tokens_per_seq=self.predicted_tokens_per_seq,
+            skip_create_weights_in_init=config.skip_create_weights_in_init,
+        )
+
+        self.aux_stream = aux_stream
+        self.ln_events = [torch.cuda.Event(), torch.cuda.Event()]
+
+        self.enable_rope_fusion = self.mha.support_fused_rope()
+        self.support_fused_qkv = self.mha.support_fused_qkv()
+        self.rotary_emb = RotaryEmbedding(
+            pos_embd_params.rope,
+            head_dim=self.qk_rope_head_dim,
+            is_neox=pos_embd_params.is_neox,
+        )
+        self.apply_rotary_emb = not self.enable_rope_fusion
+
+        if not config.skip_create_weights_in_init:
+            self.create_weights()
+
+    def create_weights(self):
+        # self.mha/mqa have no weights but hold state related to quant_config,
+        # which could be modified after __init__
+        self.mha.update_quant_config(self.quant_config)
+        self.mqa.update_quant_config(self.quant_config)
+
+        # k_b_proj_trans's dtype must be consistent with self.kv_b_proj,
+        # which can be modified after __init__
+        has_fp8_block_scales = (
+            self.kv_b_proj.quant_config
+            and self.kv_b_proj.quant_config.quant_mode.has_fp8_block_scales())
+
+        mla_weight_dtype = torch.float8_e4m3fn if has_fp8_block_scales else self.dtype
+        self.k_b_proj_trans = nn.Parameter(
+            torch.empty(
+                (self.num_heads, self.kv_lora_rank, self.qk_nope_head_dim),
+                dtype=mla_weight_dtype,
+            ),
+            requires_grad=False,
+        )
+
+        if has_fp8_block_scales:
+            self.k_b_proj_trans_scale = nn.Parameter(
+                torch.empty(
+                    (
+                        self.num_heads,
+                        self.kv_lora_rank // 128,
+                        self.qk_nope_head_dim // 128,
+                    ),
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+            # This parameter will view into self.kv_b_proj.weight_scale after loading weights.
+            # For dummy weight initialization, this parameter is initialized with empty tensor.
+            self.v_b_proj_scale = nn.Parameter(
+                torch.empty(
+                    (
+                        self.num_heads,
+                        self.v_head_dim // 128,
+                        self.kv_lora_rank // 128,
+                    ),
+                    dtype=torch.float32,
+                ),
+                requires_grad=False,
+            )
+        else:
+            self.k_b_proj_trans_scale = None
+            self.v_b_proj_scale = None
+
+    def apply_rope(
+        self,
+        q: torch.Tensor,
+        k_pe: torch.Tensor,
+        position_ids: torch.Tensor,
+    ) -> torch.Tensor:
+        q = q.view(-1, self.num_heads, self.qk_head_dim)
+        q_pe = q[..., self.qk_nope_head_dim:].reshape(
+            -1, self.num_heads * self.qk_rope_head_dim)
+        q_pe, k_pe = self.rotary_emb(position_ids, [q_pe, k_pe])
+        q[..., self.qk_nope_head_dim:] = q_pe.view(-1, self.num_heads,
+                                                   self.qk_rope_head_dim)
+        return k_pe
+
+    def forward_impl_fake(self, hidden_states: torch.Tensor):
+        num_tokens = hidden_states.shape[0]
+        hidden_size = self.o_proj.in_features
+        return hidden_states.new_empty([num_tokens, hidden_size],
+                                       dtype=hidden_states.dtype)
+
+    def forward_impl(
+        self,
+        position_ids: Optional[torch.Tensor],
+        hidden_states: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        """
+        Forward pass for the MLA module.
+
+        Args:
+            position_ids (Optional[torch.Tensor]): The position IDs. Must not be None when
+                self.apply_rotary_emb is True or when the cached-KV context path is taken.
+            hidden_states (torch.Tensor): The hidden states.
+            attn_metadata (AttentionMetadata): The attention metadata.
+
+        Returns:
+            torch.Tensor: The output tensor.
+        """
+        if self.is_lite:
+            compressed_kv, k_pe = self.fused_a(hidden_states).split(
+                [self.kv_lora_rank, self.qk_rope_head_dim], -1)
+            compressed_kv = self.kv_a_layernorm(compressed_kv)
+            q = hidden_states
+        else:
+            q, compressed_kv, k_pe = self.fused_a(hidden_states).split(
+                [self.q_lora_rank, self.kv_lora_rank, self.qk_rope_head_dim],
+                -1)
+
+            q, compressed_kv = maybe_execute_in_parallel(
+                lambda: self.q_a_layernorm(q),
+                lambda: self.kv_a_layernorm(compressed_kv),
+                self.ln_events[0],
+                self.ln_events[1],
+                self.aux_stream,
+            )
+
+        q, latent_cache = maybe_execute_in_parallel(
+            lambda: self.q_b_proj(q),
+            lambda: torch.concat([compressed_kv, k_pe], dim=-1),
+            self.ln_events[0],
+            self.ln_events[1],
+            self.aux_stream,
+        )
+
+        # split q, k, v into context and gen batches
+        num_contexts = attn_metadata.num_contexts
+        num_generations = attn_metadata.num_generations
+        num_ctx_tokens = attn_metadata.num_ctx_tokens
+        num_tokens = attn_metadata.num_tokens
+
+        assert q.shape[
+            0] == num_tokens, f"Expect q.shape[0] to be {num_tokens}, but got {q.shape[0]}"
+
+        if num_contexts > 0:
+            q_ctx = q[:num_ctx_tokens, ...]
+            compressed_kv_ctx = compressed_kv[:num_ctx_tokens, ...]
+            k_pe_ctx = k_pe[:num_ctx_tokens, ...]
+            latent_cache_ctx = latent_cache[:num_ctx_tokens, ...]
+            if self.apply_rotary_emb:
+                assert position_ids is not None
+                k_pe_ctx = self.apply_rope(q_ctx, k_pe_ctx, position_ids)
+
+            attn_output_context = self.forward_context(q_ctx, compressed_kv_ctx,
+                                                       k_pe_ctx, attn_metadata,
+                                                       latent_cache_ctx,
+                                                       position_ids)
+        else:
+            attn_output_context = None
+
+        if num_generations > 0:
+            q_gen = q[num_ctx_tokens:, ...]
+            compressed_kv_gen = compressed_kv[num_ctx_tokens:, ...]
+            k_pe_gen = k_pe[num_ctx_tokens:, ...]
+            latent_cache_gen = latent_cache[num_ctx_tokens:, ...]
+            if self.apply_rotary_emb:
+                assert position_ids is not None
+                k_pe_gen = self.apply_rope(q_gen, k_pe_gen, position_ids)
+
+            attn_output_gen = self.forward_generation(q_gen, compressed_kv_gen,
+                                                      k_pe_gen, attn_metadata,
+                                                      latent_cache_gen)
+        else:
+            attn_output_gen = None
+
+        # release pytorch activation memory
+        q = None
+        compressed_kv = None
+        k_pe = None
+
+        # merge context and gen batches
+        if attn_output_context is not None and attn_output_gen is not None:
+            assert (
+                len(attn_output_context.shape) == 2
+            ), f"attn_output_context must be rank 2, not {len(attn_output_context.shape)}"
+            assert (
+                len(attn_output_gen.shape) == 2
+            ), f"attn_output_gen must be rank 2, not {len(attn_output_gen.shape)}"
+            attn_output = torch.cat([attn_output_context, attn_output_gen],
+                                    dim=0)
+            # release pytorch activation memory
+            attn_output_context = None
+            attn_output_gen = None
+        elif attn_output_gen is None:
+            attn_output = attn_output_context
+        else:
+            attn_output = attn_output_gen
+
+        return attn_output
+
+    def _maybe_concat_qkv(self, q, k, v):
+        if k is not None and v is not None and self.support_fused_qkv:
+            qkv = torch.concat([q, k, v], dim=-1)
+            q, k, v = qkv, None, None
+        return q, k, v
+
+    def forward_context_default(
+        self,
+        q: torch.Tensor,
+        compressed_kv: torch.Tensor,
+        k_pe: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+        latent_cache: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        kv = self.kv_b_proj(compressed_kv)
+        k_nope, v = kv.split(
+            [
+                self.num_heads * self.qk_nope_head_dim,
+                self.num_heads * self.v_head_dim
+            ],
+            -1,
+        )
+
+        k = torch.empty_like(q).view(-1, self.num_heads, self.qk_head_dim)
+        k[..., :self.qk_nope_head_dim] = k_nope.view(-1, self.num_heads,
+                                                     self.qk_nope_head_dim)
+        if self.apply_rotary_emb:
+            k[..., self.qk_nope_head_dim:] = k_pe.view(-1, 1,
+                                                       self.qk_rope_head_dim)
+        k = k.view(-1, self.num_heads * self.qk_head_dim)
+
+        # May concat q(including q_pe), k + k_pe, v together
+        q, k, v = self._maybe_concat_qkv(q, k, v)
+
+        # out_scale = getattr(self.o_proj, "inv_input_scale", None)
+        out_scale = None  # Currently we use BF16 MHA for context phase
+
+        attn_output = self.mha.forward(
+            q,
+            k,
+            v,
+            attn_metadata,
+            attention_input_type=AttentionInputType.context_only,
+            latent_cache=latent_cache,
+            out_scale=out_scale,
+        )
+
+        return attn_output
+
+    def forward_context_with_cached_kv(
+        self,
+        q: torch.Tensor,
+        compressed_kv: torch.Tensor,
+        k_pe: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+        position_ids: Optional[torch.LongTensor] = None,
+    ) -> torch.Tensor:
+        trtllm_attention = cast(TrtllmAttention, self.mha)
+        # split current q into q_nope and q_pe
+        q_nope, q_pe = q.view([
+            -1, self.num_heads, self.qk_nope_head_dim + self.qk_rope_head_dim
+        ]).split([self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+
+        # apply rope to current q_pe and k_pe
+        assert position_ids is not None
+        assert position_ids.dim() == 1 or (position_ids.dim() == 2
+                                           and position_ids.shape[0] == 1)
+        assert self.rotary_emb is not None
+        assert self.rotary_emb.head_dim == self.qk_rope_head_dim
+        assert q_pe.shape[0] == k_pe.shape[0]
+        q_pe = q_pe.contiguous().view(-1,
+                                      self.num_heads * self.qk_rope_head_dim)
+        q_pe, k_pe = self.rotary_emb(
+            position_ids[..., :attn_metadata.num_ctx_tokens], [q_pe, k_pe])
+        k_pe = k_pe.contiguous()
+
+        # build q for attention op
+        q_view = q.view(-1, self.num_heads,
+                        self.qk_nope_head_dim + self.qk_rope_head_dim)
+        q_view[:, :,
+               self.qk_nope_head_dim:] = q_pe.view(-1, self.num_heads,
+                                                   self.qk_rope_head_dim)
+        q = q_view.view(
+            -1,
+            self.num_heads * (self.qk_nope_head_dim + self.qk_rope_head_dim))
+        assert q.is_contiguous()
+
+        # append paged kv cache for mla
+        trtllm_attention.append_paged_kv_cache_for_mla(
+            compressed_kv,
+            k_pe,
+            attn_metadata,
+        )
+
+        # copy full_compressed_kv and full_k_pe from paged kv cache
+        full_compressed_kv, full_k_pe = trtllm_attention.load_paged_kv_cache_for_mla(
+            attn_metadata, q.dtype)
+        assert full_compressed_kv.shape[
+            0] == attn_metadata.num_ctx_cached_tokens + attn_metadata.num_ctx_tokens
+        assert full_compressed_kv.shape[1] == self.kv_lora_rank
+        assert full_k_pe.shape[
+            0] == attn_metadata.num_ctx_cached_tokens + attn_metadata.num_ctx_tokens
+        assert full_k_pe.shape[1] == self.qk_rope_head_dim
+        assert full_compressed_kv.is_contiguous()
+        assert full_k_pe.is_contiguous()
+
+        # compute full_k_nope and full_v from full_compressed_kv
+        full_kv = self.kv_b_proj(full_compressed_kv)
+        full_k_nope, full_v = full_kv.split(
+            [
+                self.num_heads * self.qk_nope_head_dim,
+                self.num_heads * self.v_head_dim
+            ],
+            -1,
+        )
+        full_k_nope = full_k_nope.view(-1, self.num_heads,
+                                       self.qk_nope_head_dim)
+        full_v = full_v.view(-1, self.num_heads, self.v_head_dim)
+
+        # build full_k and full_v
+        tokens_per_block = attn_metadata.kv_cache_manager.tokens_per_block
+        # paged kv cache should be initialized to 0 to avoid NaN
+        paged_full_kv = torch.zeros([
+            attn_metadata.num_contexts, 2,
+            (attn_metadata.max_ctx_kv_len + tokens_per_block - 1) //
+            tokens_per_block, self.num_heads, tokens_per_block,
+            max(self.qk_nope_head_dim + self.qk_rope_head_dim, self.v_head_dim)
+        ],
+                                    dtype=q.dtype,
+                                    device=q.device)
+        mla_context_kv_cache_block_offsets = trtllm_attention.set_paged_kv_cache_for_mla(
+            paged_full_kv,
+            full_k_nope,
+            full_v,
+            full_k_pe,
+            attn_metadata,
+        )
+
+        # out_scale = getattr(self.o_proj, "inv_input_scale", None)
+        out_scale = None  # Currently we use BF16 MHA for context phase
+
+        attn_output = self.mha.forward(
+            q,
+            None,
+            None,
+            attn_metadata,
+            attention_input_type=AttentionInputType.context_only,
+            latent_cache=None,
+            out_scale=out_scale,
+            mla_context_paged_kv=paged_full_kv,
+            mla_context_kv_cache_block_offsets=
+            mla_context_kv_cache_block_offsets,
+        )
+
+        return attn_output
+
+    def forward_context(
+        self,
+        q: torch.Tensor,
+        compressed_kv: torch.Tensor,
+        k_pe: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+        latent_cache: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+    ) -> torch.Tensor:
+        if isinstance(self.mha, TrtllmAttention):
+            assert isinstance(attn_metadata, TrtllmAttentionMetadata)
+            trtllm_attention = cast(TrtllmAttention, self.mha)
+            if trtllm_attention.has_cached_kv_for_mla_context(attn_metadata):
+                return self.forward_context_with_cached_kv(
+                    q, compressed_kv, k_pe, attn_metadata, position_ids)
+        return self.forward_context_default(q, compressed_kv, k_pe,
+                                            attn_metadata, latent_cache)
+
+    def forward_generation(
+        self,
+        q: torch.Tensor,
+        compressed_kv: torch.Tensor,
+        k_pe: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+        latent_cache: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        num_tokens = q.shape[0]
+        q_nope, q_pe = q.view([-1, self.num_heads, self.qk_head_dim]).split(
+            [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
+
+        # fused_q contains 1) the result of the following bmm with shape [num_tokens, num_heads, kv_lora_rank]
+        # 2) rope(q_pe) with shape [num_tokens, num_heads, qk_rope_head_dim]. rope is applied inside AttentionOp
+        fused_q = torch.empty(
+            [
+                num_tokens, self.num_heads,
+                (self.kv_lora_rank + self.qk_rope_head_dim)
+            ],
+            dtype=q.dtype,
+            device=q.device,
+        )
+
+        if self.k_b_proj_trans.dtype == torch.bfloat16:
+            # [num_heads, num_tokens, self.qk_nope_head_dim]
+            q_nope_t = q_nope.transpose(0, 1)
+            # [num_heads, num_tokens, self.kv_lora_rank]
+            q_nope_out = fused_q[..., :self.kv_lora_rank].transpose(0, 1)
+
+            # [num_heads, num_tokens, qk_nope_head_dim] x [num_heads, qk_nope_head_dim, kv_lora_rank]
+            # -> [num_heads, num_tokens, kv_lora_rank] -> [num_tokens, num_heads, kv_lora_rank]
+            # The output of bmm is written directly into fused_q
+            torch.ops.trtllm.bmm_out(q_nope_t,
+                                     self.k_b_proj_trans.transpose(1, 2),
+                                     q_nope_out)
+        elif self.k_b_proj_trans.dtype == torch.float8_e4m3fn:
+            q_nope_fp8, q_nope_scales = torch.ops.trtllm.fp8_batched_quantize_1x128_permute102(
+                q_nope)
+            # [num_heads, num_tokens, self.kv_lora_rank]
+            q_nope_out = fused_q[..., :self.kv_lora_rank].transpose(0, 1)
+
+            torch.ops.trtllm.fp8_block_scaling_bmm_out(
+                q_nope_fp8, self.k_b_proj_trans, q_nope_scales,
+                self.k_b_proj_trans_scale, q_nope_out)
+            q_nope_scales = None
+        else:
+            raise NotImplementedError(
+                f"Missing bmm impl for dtype: {self.k_b_proj_trans.dtype}.")
+
+        if self.apply_rotary_emb:
+            fused_q[..., self.kv_lora_rank:] = q_pe
+        fused_q = fused_q.view([
+            num_tokens,
+            self.num_heads * (self.kv_lora_rank + self.qk_rope_head_dim)
+        ])
+
+        # out_scale = getattr(self.o_proj, "inv_input_scale", None)
+        out_scale = None  # Although we use FP8 MLA for generation phase, the output is still in BF16
+
+        attn_out_latent = self.mqa.forward(
+            fused_q,
+            None,
+            None,
+            attn_metadata,
+            attention_input_type=AttentionInputType.generation_only,
+            out_scale=out_scale,
+            latent_cache=latent_cache,  # kvcache and k_pe
+            q_pe=q_pe,  # used by `invokeMLARopeGeneration`
+        )
+        fused_q = None
+
+        assert (attn_out_latent.shape[0] == q.shape[0] and
+                attn_out_latent.shape[1] == self.num_heads * self.kv_lora_rank)
+
+        # [seq, num_heads, kv_lora_rank]
+        attn_out_latent = attn_out_latent.view(
+            [-1, self.num_heads, self.kv_lora_rank])
+
+        attn_output = torch.empty([num_tokens, self.num_heads, self.v_head_dim],
+                                  dtype=attn_out_latent.dtype,
+                                  device=attn_out_latent.device)
+
+        if self.v_b_proj.dtype == torch.bfloat16:
+            # [num_heads, seq, kv_lora_rank] x [num_heads, kv_lora_rank, v_head_dim]
+            # -> [num_heads, seq, v_head_dim]
+            torch.ops.trtllm.bmm_out(attn_out_latent.transpose(0, 1),
+                                     self.v_b_proj.transpose(1, 2),
+                                     attn_output.transpose(0, 1))
+        elif self.v_b_proj.dtype == torch.float8_e4m3fn:
+            attn_out_latent, attn_out_latent_scales = torch.ops.trtllm.fp8_batched_quantize_1x128_permute102(
+                attn_out_latent)
+
+            torch.ops.trtllm.fp8_block_scaling_bmm_out(
+                attn_out_latent, self.v_b_proj, attn_out_latent_scales,
+                self.v_b_proj_scale, attn_output.transpose(0, 1))
+            attn_out_latent_scales = None
+        else:
+            raise NotImplementedError(
+                f"Missing bmm impl for dtype: {self.v_b_proj.dtype}.")
+
+        # [seq, num_heads * v_head_dim]
+        return attn_output.flatten(1, 2)
+
+    def forward(
+        self,
+        position_ids: Optional[torch.Tensor],
+        hidden_states: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+        all_reduce_params: Optional[AllReduceParams] = None,
+    ) -> torch.Tensor:
+        if self.register_to_config:
+            attn_output = torch.ops.trtllm.mla_custom_op(
+                position_ids, hidden_states, self.layer_idx_str)
+        else:
+            attn_output = self.forward_impl(position_ids, hidden_states,
+                                            attn_metadata)
+        attn_output = self.o_proj(attn_output,
+                                  all_reduce_params=all_reduce_params)
+        return attn_output
diff --git a/latest/_downloads/cba6509356738d5d6b4dcb3b7f52cf39/llm_args.py b/latest/_downloads/cba6509356738d5d6b4dcb3b7f52cf39/llm_args.py
index a60766d789..0835058eda 100644
--- a/latest/_downloads/cba6509356738d5d6b4dcb3b7f52cf39/llm_args.py
+++ b/latest/_downloads/cba6509356738d5d6b4dcb3b7f52cf39/llm_args.py
@@ -1,14 +1,17 @@
 import json
 import math
+import os
 from abc import ABC, abstractmethod
-from dataclasses import dataclass, field, fields
+from dataclasses import dataclass, field
 from enum import Enum, EnumMeta
 from pathlib import Path
-from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
+from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Literal, Optional,
+                    Union)
 
 import torch
 import yaml
-from pydantic import BaseModel, Field, validator
+from pydantic import (BaseModel, Field, PrivateAttr, field_validator,
+                      model_validator)
 from strenum import StrEnum
 from transformers import PreTrainedTokenizerBase
 
@@ -17,23 +20,30 @@ from tensorrt_llm.lora_manager import (LoraConfig,
 
 from .._utils import mpi_rank
 from ..auto_parallel import AutoParallelConfig, infer_cluster_config
+
+if TYPE_CHECKING:
+    from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
+
 # yapf: disable
-from ..bindings.executor import BatchingType as _BatchingType
-from ..bindings.executor import \
-    CacheTransceiverConfig as _CacheTransceiverConfig
-from ..bindings.executor import \
-    CapacitySchedulerPolicy as _CapacitySchedulerPolicy
-from ..bindings.executor import ContextChunkingPolicy as _ContextChunkingPolicy
-from ..bindings.executor import DecodingConfig, DecodingMode
-from ..bindings.executor import DynamicBatchConfig as _DynamicBatchConfig
-from ..bindings.executor import EagleConfig, ExecutorConfig
-from ..bindings.executor import \
-    ExtendedRuntimePerfKnobConfig as _ExtendedRuntimePerfKnobConfig
-from ..bindings.executor import KvCacheConfig as _KvCacheConfig
-from ..bindings.executor import \
-    LookaheadDecodingConfig as _LookaheadDecodingConfig
-from ..bindings.executor import PeftCacheConfig as _PeftCacheConfig
-from ..bindings.executor import SchedulerConfig as _SchedulerConfig
+# isort: off
+from ..bindings.executor import (
+                                 BatchingType as _BatchingType,
+                                 CacheTransceiverConfig as _CacheTransceiverConfig,
+                                 CapacitySchedulerPolicy as _CapacitySchedulerPolicy,
+                                 ContextChunkingPolicy as _ContextChunkingPolicy,
+                                 DecodingConfig,
+                                 DecodingMode,
+                                 DynamicBatchConfig as _DynamicBatchConfig,
+                                 EagleConfig as _EagleConfig,
+                                 ExecutorConfig as _ExecutorConfig,
+                                 ExtendedRuntimePerfKnobConfig as _ExtendedRuntimePerfKnobConfig,
+                                 KvCacheConfig as _KvCacheConfig,
+                                 LookaheadDecodingConfig as _LookaheadDecodingConfig,
+                                 PeftCacheConfig as _PeftCacheConfig,
+                                 SchedulerConfig as _SchedulerConfig) # isort: skip
+# isort: on
+from transformers import PreTrainedTokenizerBase
+
 # yapf: enable
 from ..builder import BuildConfig, EngineConfig
 from ..logger import logger
@@ -195,7 +205,8 @@ class DecodingBaseConfig(BaseModel):
             "MTP": MTPDecodingConfig,
             "Medusa": MedusaDecodingConfig,
             "Eagle": EagleDecodingConfig,
-            "Lookahead": LookaheadDecodingConfig
+            "Lookahead": LookaheadDecodingConfig,
+            "NGram": NGramDecodingConfig,
         }
 
         config_class = config_classes.get(decoding_type)
@@ -228,6 +239,7 @@ class EagleDecodingConfig(DecodingBaseConfig):
     num_eagle_layers: Optional[int] = None
     max_non_leaves_per_layer: Optional[int] = None
     pytorch_eagle_weights_path: Optional[str] = None
+    eagle3_one_model: Optional[bool] = True
 
     @classmethod
     def from_dict(cls, data: dict):
@@ -236,6 +248,40 @@ class EagleDecodingConfig(DecodingBaseConfig):
     decoding_type: ClassVar[str] = "Eagle"
 
 
+class NGramDecodingConfig(DecodingBaseConfig):
+    """
+    Configuration for NGram drafter speculative decoding.
+
+    Arguments:
+        prompt_lookup_num_tokens: int = 2
+            The maximum number of draft tokens to propose (i.e. the maximum length of the output draft).
+
+        max_matching_ngram_size: int = 4
+            The maximum number of tokens to match against (i.e. the maximum length of the input pattern to search for).
+
+        is_keep_all: bool = True
+            Whether to keep all candidate pattern-match pairs; if False, only one match is kept for each pattern.
+
+        is_use_oldest: bool = True
+            Whether to provide the oldest match when a pattern is hit; if False, the newest one is provided.
+
+        is_public_pool: bool = True
+            Whether to use a common pool shared by all requests; if False, each request has its own private pool.
+    """
+
+    prompt_lookup_num_tokens: int = 2
+    max_matching_ngram_size: int = 4
+    is_keep_all: bool = True
+    is_use_oldest: bool = True
+    is_public_pool: bool = True
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        return cls(**data)
+
+    decoding_type: ClassVar[str] = "NGram"
+
+
 class MTPDecodingConfig(DecodingBaseConfig):
     num_nextn_predict_layers: Optional[int] = 1
     use_relaxed_acceptance_for_thinking: Optional[bool] = False
@@ -512,7 +558,9 @@ class LookaheadDecodingConfig(DecodingBaseConfig, PybindMirror):
         get_default_lookahead_decoding_verification_set(),
         description="Number of NGrams in verification branch per step.")
 
-    @validator('max_window_size', 'max_ngram_size', 'max_verification_set_size')
+    @field_validator('max_window_size', 'max_ngram_size',
+                     'max_verification_set_size')
+    @classmethod
     def validate_positive_values(cls, v):
         if v <= 0:
             raise ValueError(f"Value must be positive, got {v}")
@@ -699,7 +747,10 @@ class _ModelWrapper:
         return self.model if isinstance(self.model, str) else None
 
 
-class LlmArgs(BaseModel):
+class BaseLlmArgs(BaseModel):
+    """
+    Base class for both TorchLlmArgs and TrtLlmArgs. It contains all the arguments that are common to both.
+    """
     model_config = {
         "arbitrary_types_allowed": True,
         "extra": "allow",
@@ -771,20 +822,11 @@ class LlmArgs(BaseModel):
     cp_config: Optional[dict] = Field(default_factory=dict,
                                       description="Context parallel config.")
 
-    auto_parallel: bool = Field(default=False,
-                                description="Enable auto parallel mode.")
-
-    auto_parallel_world_size: Optional[int] = Field(
-        default=None, description="The world size for auto parallel mode.")
-
     load_format: Literal['auto', 'dummy'] = Field(
         default='auto',
         description="The format to load the model.",
         json_schema_extra={"type": "Literal['auto', 'dummy']"})
 
-    enable_tqdm: bool = Field(default=False,
-                              description="Enable tqdm for progress bar.")
-
     # LoRA arguments
     enable_lora: bool = Field(default=False, description="Enable LoRA.")
 
@@ -816,18 +858,9 @@ class LlmArgs(BaseModel):
     quant_config: Optional[QuantConfig] = Field(
         default=None, description="Quantization config.")
 
-    calib_config: Optional[CalibConfig] = Field(
-        default=None, description="Calibration config.")
-
-    # BuildConfig is introduced to give users a familiar interface to configure the model building.
-    build_config: Optional[object] = Field(
-        default=None,
-        description="Build config.",
-        json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"})
-
     # Several options from ExecutorConfig, expanded here for less hierarchy
-    kv_cache_config: Optional[KvCacheConfig] = Field(
-        default=None, description="KV cache config.")
+    kv_cache_config: KvCacheConfig = Field(default_factory=KvCacheConfig,
+                                           description="KV cache config.")
 
     enable_chunked_prefill: bool = Field(default=False,
                                          description="Enable chunked prefill.")
@@ -850,29 +883,12 @@ class LlmArgs(BaseModel):
         default=None,
         description="The maximum number of iterations for request stats.")
 
-    workspace: Optional[str] = Field(default=None,
-                                     description="The workspace for the model.")
-
     # A handful of options from PretrainedConfig
-    embedding_parallel_mode: str = Field(
-        default='SHARDING_ALONG_VOCAB',
-        description="The embedding parallel mode.")
-
-    fast_build: bool = Field(default=False, description="Enable fast build.")
-
-    # Once set, the model will reuse the build_cache
-    enable_build_cache: object = Field(
-        default=False,
-        description="Enable build cache.",
-        json_schema_extra={
-            "type": f"Union[{get_type_repr(BuildCacheConfig)}, bool]"
-        })
-
     peft_cache_config: Optional[PeftCacheConfig] = Field(
         default=None, description="PEFT cache config.")
 
-    scheduler_config: Optional[SchedulerConfig] = Field(
-        default=None, description="Scheduler config.")
+    scheduler_config: SchedulerConfig = Field(default_factory=SchedulerConfig,
+                                              description="Scheduler config.")
 
     cache_transceiver_config: Optional[CacheTransceiverConfig] = Field(
         default=None, description="Cache transceiver config.")
@@ -880,8 +896,8 @@ class LlmArgs(BaseModel):
     # Speculative decoding parameters
     speculative_config: Optional[Union[
         LookaheadDecodingConfig, MedusaDecodingConfig, EagleDecodingConfig,
-        MTPDecodingConfig]] = Field(default=None,
-                                    description="Speculative decoding config.")
+        MTPDecodingConfig, NGramDecodingConfig]] = Field(
+            default=None, description="Speculative decoding config.")
 
     batching_type: Optional[BatchingType] = Field(default=None,
                                                   description="Batching type.")
@@ -889,13 +905,6 @@ class LlmArgs(BaseModel):
     normalize_log_probs: bool = Field(
         default=False, description="Normalize log probabilities.")
 
-    gather_generation_logits: bool = Field(
-        default=False, description="Gather generation logits.")
-
-    extended_runtime_perf_knob_config: Optional[
-        ExtendedRuntimePerfKnobConfig] = Field(
-            default=None, description="Extended runtime perf knob config.")
-
     max_batch_size: Optional[int] = Field(default=None,
                                           description="The maximum batch size.")
 
@@ -916,6 +925,9 @@ class LlmArgs(BaseModel):
                                    description="The backend to use.",
                                    exclude=True)
 
+    gather_generation_logits: bool = Field(
+        default=False, description="Gather generation logits.")
+
     # private fields those are unstable and just for internal use
     num_postprocess_workers: int = Field(
         default=0,
@@ -988,40 +1000,19 @@ class LlmArgs(BaseModel):
             moe_tp_size=self.moe_tensor_parallel_size,
             moe_ep_size=self.moe_expert_parallel_size,
             enable_attention_dp=self.enable_attention_dp,
-            cp_config=self.cp_config,
-            auto_parallel=self.auto_parallel)
-        if self.parallel_config.auto_parallel:
-            self.parallel_config.world_size = self.auto_parallel_world_size
-
-        self.auto_parallel_config = AutoParallelConfig(
-            sharded_io_allowlist=[
-                "past_key_value_\\d+",
-                "present_key_value_\\d*",
-            ],
-            same_buffer_io={
-                "past_key_value_(\\d+)": "present_key_value_\\1",
-            },
-            **infer_cluster_config(),
-        )
-
-        self.kv_cache_config = self.kv_cache_config or KvCacheConfig()
-
-        self.scheduler_config = self.scheduler_config or SchedulerConfig()
-
-        # This is used to hold th options for convert_checkpoint
-        self._convert_checkpoint_options = {}
+            cp_config=self.cp_config)
 
     @classmethod
-    def from_kwargs(cls, **kwargs: Any) -> "LlmArgs":
+    def from_kwargs(cls, **kwargs: Any) -> "BaseLlmArgs":
         """Create `LlmArgs` instance from kwargs.
 
         Args:
             kwargs (Any): Arguments passed to `LlmArgs` constructor.
 
         Returns:
-            tensorrt_llm.llmapi.llm_utils.LlmArgs: The `LlmArgs` instance.
+            tensorrt_llm.llmapi.llm_utils.BaseLlmArgs: The `BaseLlmArgs` instance.
         """
-        kwargs = LlmArgs._maybe_update_config_for_consistency(dict(kwargs))
+        kwargs = BaseLlmArgs._maybe_update_config_for_consistency(dict(kwargs))
         ret = cls(**kwargs)
         ret._setup()
         return ret
@@ -1032,8 +1023,7 @@ class LlmArgs(BaseModel):
         Returns:
             dict: The dict that contains all fields of the `LlmArgs` instance.
         """
-        return dict(
-            (field.name, getattr(self, field.name)) for field in fields(self))
+        return self.model_dump()
 
     @staticmethod
     def _maybe_update_config_for_consistency(
@@ -1041,18 +1031,18 @@ class LlmArgs(BaseModel):
         # max_beam_width is not included since vague behavior due to lacking the support for dynamic beam width during
         # generation
         black_list = set(["max_beam_width"])
-        executor_config_attrs = set(attr for attr in dir(ExecutorConfig)
-                                    if not attr.startswith('_')
-                                    and callable(getattr(ExecutorConfig, attr)))
+        executor_config_attrs = set(
+            attr for attr in dir(_ExecutorConfig) if not attr.startswith('_')
+            and callable(getattr(_ExecutorConfig, attr)))
         executor_config_attrs -= black_list
-        llm_args_attr = set(LlmArgs.model_fields.keys())
-        # NOTE: When cpp ExecutorConfig add new options, please add the new options into `_LlmArgs` with docs as well
+        llm_args_attr = set(BaseLlmArgs.model_fields.keys())
+        # NOTE: When the C++ ExecutorConfig adds new options, please add them to `LlmArgs` with docs as well.
         # ASK chunweiy for help if you are not sure about the new options.
         assert executor_config_attrs.issubset(
             llm_args_attr
         ), f"New options found in underlying ExecutorConfig: {llm_args_attr - executor_config_attrs}"
 
-        # ensure build_config and LlmArgs consistency
+        # Ensure that build_config and BaseLlmArgs are consistent
         if kwargs_dict.get("backend") != "pytorch" and kwargs_dict.get(
                 "build_config"):
             # TODO: move this to _perform_config_arbitration() once it's default-on.
@@ -1062,11 +1052,11 @@ class LlmArgs(BaseModel):
                 build_val = getattr(kwargs_dict["build_config"], field_name,
                                     None)
                 llmargs_val = kwargs_dict.get(
-                    field_name) or LlmArgs.model_fields[field_name]
+                    field_name) or BaseLlmArgs.model_fields[field_name]
 
                 if build_val != llmargs_val:
                     logger.warning(
-                        f"Overriding LlmArgs.{field_name} ({llmargs_val}) with build_config.{field_name} ({build_val})."
+                        f"Overriding LlmArgsBase.{field_name} ({llmargs_val}) with build_config.{field_name} ({build_val})."
                     )
                     kwargs_dict[field_name] = build_val
 
@@ -1075,12 +1065,15 @@ class LlmArgs(BaseModel):
     def _setup(self):
         ''' This method will setup the configs right before building the model. '''
 
+        is_trt_llm_args = isinstance(self, TrtLlmArgs)
+
         assert isinstance(self.model,
                           (str, Path)), f"Invalid model: {self.model}"
 
-        self._setup_embedding_parallel_mode()
+        if is_trt_llm_args:
+            self._setup_embedding_parallel_mode()
 
-        if self.enable_build_cache:
+        if is_trt_llm_args and self.enable_build_cache:
             self.enable_build_cache = BuildCacheConfig() if isinstance(
                 self.enable_build_cache, bool) else self.enable_build_cache
             if not isinstance(self.enable_build_cache, BuildCacheConfig):
@@ -1121,7 +1114,8 @@ class LlmArgs(BaseModel):
 
         self.quant_config = self.quant_config or QuantConfig()
 
-        self.calib_config = self.calib_config or CalibConfig()
+        if is_trt_llm_args:
+            self.calib_config = self.calib_config or CalibConfig()
 
         # Note: max_batch_size and max_num_tokens in LlmArgs are for runtime,
         # which will be passed to the C++ Executor API, overwriting the values
@@ -1148,8 +1142,9 @@ class LlmArgs(BaseModel):
                 self.build_config.max_num_tokens = self.max_num_tokens
 
         # TODO: remove the checker when manage weights support all data types
-        if self.fast_build and (self.quant_config.quant_algo is QuantAlgo.FP8
-                                or self.quant_config.quant_algo is None):
+        if is_trt_llm_args and self.fast_build and (
+                self.quant_config.quant_algo is QuantAlgo.FP8
+                or self.quant_config.quant_algo is None):
             self._update_plugin_config("manage_weights", True)
 
         if self.parallel_config._world_size == 1:
@@ -1162,9 +1157,12 @@ class LlmArgs(BaseModel):
             if self.max_lora_rank is not None:
                 self.build_config.lora_config.max_lora_rank = self.max_lora_rank
 
+        self._setup_speculative_config()
+
         if self.enable_prompt_adapter:
             self.build_config.max_prompt_embedding_table_size = self.max_prompt_adapter_token * self.build_config.max_batch_size
 
+    def _setup_speculative_config(self):
         if self.speculative_config:
             if isinstance(self.speculative_config, LookaheadDecodingConfig):
                 lookahead_config = self.speculative_config
@@ -1194,7 +1192,7 @@ class LlmArgs(BaseModel):
                 self.build_config.max_draft_len = self.speculative_config.max_draft_len
 
                 if self.backend != 'pytorch':
-                    eagle_config = EagleConfig(
+                    eagle_config = _EagleConfig(
                         self.speculative_config.eagle_choices,
                         self.speculative_config.greedy_sampling,
                         self.speculative_config.posterior_threshold,
@@ -1207,9 +1205,25 @@ class LlmArgs(BaseModel):
                     from tensorrt_llm._torch.speculative import Eagle3Config
                     self.speculative_config = Eagle3Config(
                         max_draft_tokens=self.speculative_config.max_draft_len,
-                        eagle_weights_path=self.speculative_config.
-                        pytorch_eagle_weights_path)
-
+                        draft_model_path=self.speculative_config.
+                        pytorch_eagle_weights_path,
+                        eagle3_one_model=self.speculative_config.
+                        eagle3_one_model)
+            elif isinstance(self.speculative_config, NGramDecodingConfig):
+                self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.NGRAM
+                assert self.backend == 'pytorch'
+                assert self.speculative_config.prompt_lookup_num_tokens > 0 and self.speculative_config.max_matching_ngram_size > 0
+                self.build_config.max_draft_len = self.speculative_config.max_draft_len
+                from tensorrt_llm._torch.speculative import NGramConfig
+                self.speculative_config = NGramConfig(
+                    prompt_lookup_num_tokens=self.speculative_config.
+                    prompt_lookup_num_tokens,
+                    max_matching_ngram_size=self.speculative_config.
+                    max_matching_ngram_size,
+                    is_keep_all=self.speculative_config.is_keep_all,
+                    is_use_oldest=self.speculative_config.is_use_oldest,
+                    is_public_pool=self.speculative_config.is_public_pool,
+                )
             elif isinstance(self.speculative_config, MTPDecodingConfig):
                 from tensorrt_llm._torch.speculative import MTPConfig
                 self.speculative_config = MTPConfig(
@@ -1350,32 +1364,385 @@ class LlmArgs(BaseModel):
                 f"Invalid embedding_parallel_mode: {self.llm_args.embedding_parallel_mode}"
             )
 
-    def _validate_kv_cache_config(self):
-        if self.kv_cache_config is None:
-            raise ValueError("KvCacheConfig is required for streaming LLM.")
 
-        if self.kv_cache_config.max_attention_window is None:
-            raise ValueError(
-                "KvCacheConfig.max_attention_window should be set for streaming LLM."
-            )
-        if any(i <= 0 for i in self.kv_cache_config.max_attention_window):
-            raise ValueError(
-                "Elements in KvCacheConfig.max_attention_window should be greater than 0."
-            )
+class TrtLlmArgs(BaseLlmArgs):
 
-        if self.kv_cache_config.sink_token_length is None:
-            raise ValueError(
-                "KvCacheConfig.sink_token_length should be set for streaming LLM."
-            )
-        if self.kv_cache_config.sink_token_length <= 0:
-            raise ValueError(
-                "KvCacheConfig.sink_token_length should be greater than 0.")
+    auto_parallel: bool = Field(
+        default=False,
+        description="Enable auto parallel mode.",
+        deprecated=
+        "Use tensor_parallel_size/pipeline_parallel_size/xxx_parallel_size instead.",
+    )
 
+    auto_parallel_world_size: Optional[int] = Field(
+        default=None,
+        description="The world size for auto parallel mode.",
+        deprecated=
+        "Use tensor_parallel_size/pipeline_parallel_size/xxx_parallel_size instead.",
+    )
+
+    enable_tqdm: bool = Field(default=False,
+                              description="Enable tqdm for progress bar.")
+
+    # BuildConfig is introduced to give users a familiar interface to configure the model building.
+    build_config: Optional[object] = Field(
+        default=None,
+        description="Build config.",
+        json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"})
+
+    workspace: Optional[str] = Field(default=None,
+                                     description="The workspace for the model.")
+
+    # Once set, the model will reuse the build_cache
+    enable_build_cache: object = Field(
+        default=False,
+        description="Enable build cache.",
+        json_schema_extra={
+            "type": f"Union[{get_type_repr(BuildCacheConfig)}, bool]"
+        })
+
+    extended_runtime_perf_knob_config: Optional[
+        ExtendedRuntimePerfKnobConfig] = Field(
+            default=None, description="Extended runtime perf knob config.")
+
+    calib_config: Optional[CalibConfig] = Field(
+        default=None, description="Calibration config.")
+
+    embedding_parallel_mode: str = Field(
+        default='SHARDING_ALONG_VOCAB',
+        description="The embedding parallel mode.")
+
+    fast_build: bool = Field(default=False, description="Enable fast build.")
+
+    # Private attributes
+    _auto_parallel_config: Optional[AutoParallelConfig] = PrivateAttr(
+        default=None)
+    # This is used to hold the options for convert_checkpoint
+    _convert_checkpoint_options: Dict[str,
+                                      Any] = PrivateAttr(default_factory=dict)
+
+    @property
+    def auto_parallel_config(self) -> AutoParallelConfig:
+        return self._auto_parallel_config
+
+    @print_traceback_on_error
+    def model_post_init(self, __context):
+        super().model_post_init(__context)
+
+        self._auto_parallel_config = AutoParallelConfig(
+            sharded_io_allowlist=[
+                "past_key_value_\\d+",
+                "present_key_value_\\d*",
+            ],
+            same_buffer_io={
+                "past_key_value_(\\d+)": "present_key_value_\\1",
+            },
+            **infer_cluster_config(),
+        )
+
+        self.parallel_config.auto_parallel = self.auto_parallel
+
+        if self.parallel_config.auto_parallel:
+            self.parallel_config.world_size = self.auto_parallel_world_size
+
+
+LlmArgs = TrtLlmArgs
 
 LLMARGS_EXPLICIT_DOCSTRING = generate_api_docs_as_docstring(LlmArgs,
                                                             indent=' ' * 4)
 
 
+class LoadFormat(Enum):
+    AUTO = 0
+    # Initialize all weights randomly.
+    DUMMY = 1
+
+
+class TorchLlmArgs(BaseLlmArgs):
+
+    # Just a dummy BuildConfig to allow code reuse with the TrtLlmArgs
+    build_config: Optional[object] = Field(
+        default=None,
+        description="Build config.",
+        exclude_from_json=True,
+        json_schema_extra={"type": f"Optional[{get_type_repr(BuildConfig)}]"})
+
+    # PyTorch backend specific configurations
+
+    use_cuda_graph: bool = Field(
+        default=False,
+        description=
+        "If true, use CUDA graphs for decoding. CUDA graphs are only created for the batch sizes in cuda_graph_batch_sizes, and are enabled for batches that consist of decoding requests *only* (the reason is that it's hard to capture a single graph with prefill requests since the input shapes are a function of the sequence lengths). Note that each CUDA graph can use up to 200 MB of extra memory."
+    )
+
+    cuda_graph_batch_sizes: Optional[List[int]] = Field(
+        default=None,
+        description="List of batch sizes to create CUDA graphs for.")
+
+    cuda_graph_max_batch_size: int = Field(
+        default=0, description="Maximum batch size for CUDA graphs.")
+
+    cuda_graph_padding_enabled: bool = Field(
+        default=False,
+        description=
+        "If true, batches are rounded up to the nearest cuda_graph_batch_size. This is usually a net win for performance."
+    )
+
+    disable_overlap_scheduler: bool = Field(
+        default=False, description="Disable the overlap scheduler.")
+
+    moe_max_num_tokens: Optional[int] = Field(
+        default=None,
+        description=
+        "If set, at most moe_max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. If the number of tokens exceeds moe_max_num_tokens, the input tensors will be split into chunks and a for loop will be used."
+    )
+
+    moe_load_balancer: Optional[Union[object, str]] = Field(
+        default=None,
+        description="Configuration for MoE load balancing.",
+        json_schema_extra={"type": "Union[MoeLoadBalancerConfig, str]"})
+
+    attn_backend: str = Field(default='TRTLLM',
+                              description="Attention backend to use.")
+
+    moe_backend: str = Field(default='CUTLASS',
+                             description="MoE backend to use.")
+
+    mixed_sampler: bool = Field(
+        default=False,
+        description=
+        "If true, will iterate over sampling_params of each request and use the corresponding sampling strategy, e.g. top-k, top-p, etc."
+    )
+
+    enable_trtllm_sampler: bool = Field(
+        default=False,
+        description=
+        "If true, will use the TRTLLM sampler instead of the PyTorch sampler. The TRTLLM sampler has a wide coverage of sampling strategies."
+    )
+
+    kv_cache_dtype: str = Field(default="auto",
+                                description="Data type for KV cache.")
+
+    use_kv_cache: bool = Field(default=True,
+                               description="Whether to use KV cache.")
+
+    enable_iter_perf_stats: bool = Field(
+        default=False, description="Enable iteration performance statistics.")
+
+    enable_iter_req_stats: bool = Field(
+        default=False,
+        description=
+        "If true, enables per request stats per iteration. Must also set enable_iter_perf_stats to true to get request stats."
+    )
+
+    print_iter_log: bool = Field(default=False,
+                                 description="Print iteration logs.")
+
+    torch_compile_enabled: bool = Field(
+        default=False, description="Enable torch.compile optimization.")
+
+    torch_compile_fullgraph: bool = Field(
+        default=True,
+        description="Enable full graph compilation in torch.compile.")
+
+    torch_compile_inductor_enabled: bool = Field(
+        default=False, description="Enable inductor backend in torch.compile.")
+
+    torch_compile_piecewise_cuda_graph: bool = Field(
+        default=False,
+        description="Enable piecewise CUDA graph in torch.compile.")
+
+    torch_compile_enable_userbuffers: bool = Field(
+        default=True,
+        description=
+        "When torch compile is enabled, userbuffers is enabled by default.")
+
+    autotuner_enabled: bool = Field(
+        default=True,
+        description="Enable autotuner only when torch compile is enabled.")
+
+    enable_layerwise_nvtx_marker: bool = Field(
+        default=False, description="If true, enable layerwise nvtx marker.")
+
+    auto_deploy_config: Optional[object] = Field(
+        default=None,
+        description="Auto deploy config.",
+        exclude_from_json=True,
+        json_schema_extra={"type": f"Optional[AutoDeployConfig]"})
+
+    load_format: Union[str, LoadFormat] = Field(
+        default=LoadFormat.AUTO,
+        description=
+        "How to load the model weights. By default, detect the weight type from the model checkpoint."
+    )
+
+    enable_min_latency: bool = Field(
+        default=False,
+        description=
+        "If true, enable min-latency mode. Currently only used for Llama4.",
+    )
+
+    @field_validator('load_format', mode='before')
+    @classmethod
+    def convert_load_format(cls, v):
+        if isinstance(v, LoadFormat):
+            return v
+        load_format = v.upper()
+        if load_format not in LoadFormat.__members__:
+            raise ValueError(f"Invalid LoadFormat: {v}")
+        return LoadFormat[load_format]
+
+    # Extra resource managers to use in addition to the KV cache manager.
+    # Each manager's prepare_resources method is called before the forward pass,
+    # and update_resources() is called after the pass finishes. free_resources()
+    # is called when a request finishes. The KV cache manager is guaranteed to
+    # be invoked after all of these extra managers in all stages.
+    _extra_resource_managers: Dict[str,
+                                   object] = PrivateAttr(default_factory=dict, )
+
+    @property
+    def extra_resource_managers(self) -> Dict[str, object]:
+        return self._extra_resource_managers
+
+    @extra_resource_managers.setter
+    def extra_resource_managers(self, value: Dict[str, object]) -> None:
+        self._extra_resource_managers = value
+
+    @print_traceback_on_error
+    def model_post_init(self, __context):
+        from .._torch.model_config import MoeLoadBalancerConfig
+
+        super().model_post_init(__context)
+        self.model_format = _ModelFormatKind.HF
+
+        if isinstance(self.moe_load_balancer, str):
+            if not os.path.exists(self.moe_load_balancer):
+                raise FileNotFoundError(
+                    f"MoE load balancer config file not found: {self.moe_load_balancer}"
+                )
+            try:
+                with open(self.moe_load_balancer) as f:
+                    moe_load_balancer_config = yaml.safe_load(f)
+                self.moe_load_balancer = MoeLoadBalancerConfig(
+                    **moe_load_balancer_config)
+            except Exception as e:
+                raise ValueError(
+                    f"Failed to load MoE load balancer config file: {self.moe_load_balancer}"
+                ) from e
+
+    # TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig
+    def get_pytorch_backend_config(self) -> "PyTorchConfig":
+        from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
+
+        # TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig
+        # Just a workaround (WAR) to support auto_deploy
+        if self.auto_deploy_config is not None:
+            return self.auto_deploy_config
+
+        return PyTorchConfig(
+            extra_resource_managers=self.extra_resource_managers,
+            use_cuda_graph=self.use_cuda_graph,
+            cuda_graph_batch_sizes=self.cuda_graph_batch_sizes,
+            cuda_graph_max_batch_size=self.cuda_graph_max_batch_size,
+            cuda_graph_padding_enabled=self.cuda_graph_padding_enabled,
+            disable_overlap_scheduler=self.disable_overlap_scheduler,
+            moe_max_num_tokens=self.moe_max_num_tokens,
+            moe_load_balancer=self.moe_load_balancer,
+            attn_backend=self.attn_backend,
+            moe_backend=self.moe_backend,
+            mixed_sampler=self.mixed_sampler,
+            enable_trtllm_sampler=self.enable_trtllm_sampler,
+            kv_cache_dtype=self.kv_cache_dtype,
+            use_kv_cache=self.use_kv_cache,
+            enable_iter_perf_stats=self.enable_iter_perf_stats,
+            enable_iter_req_stats=self.enable_iter_req_stats,
+            print_iter_log=self.print_iter_log,
+            torch_compile_enabled=self.torch_compile_enabled,
+            torch_compile_fullgraph=self.torch_compile_fullgraph,
+            torch_compile_inductor_enabled=self.torch_compile_inductor_enabled,
+            torch_compile_piecewise_cuda_graph=self.
+            torch_compile_piecewise_cuda_graph,
+            torch_compile_enable_userbuffers=self.
+            torch_compile_enable_userbuffers,
+            autotuner_enabled=self.autotuner_enabled,
+            enable_layerwise_nvtx_marker=self.enable_layerwise_nvtx_marker,
+            load_format=self.load_format,
+            enable_min_latency=self.enable_min_latency)
+
+    @field_validator('cuda_graph_max_batch_size')
+    @classmethod
+    def validate_cuda_graph_max_batch_size(cls, v):
+        """Validate cuda_graph_max_batch_size is non-negative."""
+        if v < 0:
+            raise ValueError("cuda_graph_max_batch_size must be non-negative")
+        return v
+
+    @staticmethod
+    def _generate_cuda_graph_batch_sizes(max_batch_size: int,
+                                         padding_enabled: bool) -> List[int]:
+        """Generate a list of batch sizes for CUDA graphs.
+
+        Args:
+            max_batch_size: Maximum batch size to generate up to
+            padding_enabled: Whether padding is enabled, which affects the batch size distribution
+
+        Returns:
+            List of batch sizes to create CUDA graphs for
+        """
+        if padding_enabled:
+            batch_sizes = [1, 2, 4] + [i * 8 for i in range(1, 17)]
+        else:
+            batch_sizes = list(range(1, 32)) + [32, 64, 128]
+
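+        # Add powers of 2 from 2**8 upward. NOTE(review): the exclusive floor(log2) bound can skip the largest power of 2 below a non-power-of-2 max_batch_size (e.g. 512 is omitted when max_batch_size is 1000) - confirm whether ceil() was intended.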
+        batch_sizes += [
+            2**i for i in range(8, math.floor(math.log(max_batch_size, 2)))
+        ]
+
+        # Filter and sort batch sizes
+        batch_sizes = sorted(
+            [size for size in batch_sizes if size <= max_batch_size])
+
+        # Add max_batch_size if not already included
+        if max_batch_size != batch_sizes[-1]:
+            batch_sizes.append(max_batch_size)
+
+        return batch_sizes
+
+    @model_validator(mode='after')
+    def validate_cuda_graph_config(self) -> 'TorchLlmArgs':
+        """Validate CUDA graph configuration.
+
+        Ensures that:
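+        1. If only cuda_graph_batch_sizes is provided (cuda_graph_max_batch_size is 0), it is sorted and cuda_graph_max_batch_size is set to its largest element
+        2. If cuda_graph_batch_sizes is not provided, it is generated based on cuda_graph_max_batch_size (128 is used when that is 0)
+        3. If both are provided, the sorted cuda_graph_batch_sizes must match the generated values, otherwise a ValueError is raised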
+        """
+        if self.cuda_graph_batch_sizes is not None:
+            self.cuda_graph_batch_sizes = sorted(self.cuda_graph_batch_sizes)
+            if self.cuda_graph_max_batch_size != 0:
+                if self.cuda_graph_batch_sizes != self._generate_cuda_graph_batch_sizes(
+                        self.cuda_graph_max_batch_size,
+                        self.cuda_graph_padding_enabled):
+                    raise ValueError(
+                        "Please don't set both cuda_graph_batch_sizes "
+                        "and cuda_graph_max_batch_size.\n"
+                        f"cuda_graph_batch_sizes: {self.cuda_graph_batch_sizes}, "
+                        f"cuda_graph_max_batch_size: {self.cuda_graph_max_batch_size}"
+                    )
+            else:
+                self.cuda_graph_max_batch_size = max(
+                    self.cuda_graph_batch_sizes)
+        else:
+            max_batch_size = self.cuda_graph_max_batch_size or 128
+            generated_sizes = self._generate_cuda_graph_batch_sizes(
+                max_batch_size, self.cuda_graph_padding_enabled)
+            self.cuda_graph_batch_sizes = generated_sizes
+            self.cuda_graph_max_batch_size = max_batch_size
+
+        return self
+
+
 def update_llm_args_with_extra_dict(
         llm_args: Dict,
         llm_args_dict: Dict,
diff --git a/latest/_images/8x_l20_L40S_node_architecture.png b/latest/_images/8x_l20_L40S_node_architecture.png
new file mode 100644
index 0000000000..725427f163
Binary files /dev/null and b/latest/_images/8x_l20_L40S_node_architecture.png differ
diff --git a/latest/_images/tech_blog3_mla_absorb.png b/latest/_images/tech_blog3_mla_absorb.png
new file mode 100644
index 0000000000..ef08436db5
Binary files /dev/null and b/latest/_images/tech_blog3_mla_absorb.png differ
diff --git a/latest/_modules/index.html b/latest/_modules/index.html
index 7e4551e2ba..d43b59af11 100644
--- a/latest/_modules/index.html
+++ b/latest/_modules/index.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -680,6 +684,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/builder.html b/latest/_modules/tensorrt_llm/builder.html
index 7c61c14bd0..7d244c86a4 100644
--- a/latest/_modules/tensorrt_llm/builder.html
+++ b/latest/_modules/tensorrt_llm/builder.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1986,6 +1990,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/disaggregated_params.html b/latest/_modules/tensorrt_llm/disaggregated_params.html
index 76c164667d..d717c7998e 100644
--- a/latest/_modules/tensorrt_llm/disaggregated_params.html
+++ b/latest/_modules/tensorrt_llm/disaggregated_params.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -661,6 +665,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/executor/result.html b/latest/_modules/tensorrt_llm/executor/result.html
index 4be5107008..5d2f55c78f 100644
--- a/latest/_modules/tensorrt_llm/executor/result.html
+++ b/latest/_modules/tensorrt_llm/executor/result.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -760,6 +764,10 @@
                 <span class="n">output</span><span class="o">.</span><span class="n">finish_reason</span> <span class="o">=</span> <span class="s1">&#39;length&#39;</span>
             <span class="k">elif</span> <span class="n">finish_reasons</span><span class="p">[</span><span class="n">src_idx</span><span class="p">]</span> <span class="o">==</span> <span class="n">tllm</span><span class="o">.</span><span class="n">FinishReason</span><span class="o">.</span><span class="n">TIMED_OUT</span><span class="p">:</span>
                 <span class="n">output</span><span class="o">.</span><span class="n">finish_reason</span> <span class="o">=</span> <span class="s1">&#39;timeout&#39;</span>
+            <span class="c1"># For disaggregated serving, finish reason might be NOT_FINISHED which is ok</span>
+            <span class="k">elif</span> <span class="n">finish_reasons</span><span class="p">[</span>
+                    <span class="n">src_idx</span><span class="p">]</span> <span class="o">==</span> <span class="n">tllm</span><span class="o">.</span><span class="n">FinishReason</span><span class="o">.</span><span class="n">NOT_FINISHED</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">disaggregated_params</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">disaggregated_params</span><span class="o">.</span><span class="n">request_type</span> <span class="o">==</span> <span class="s2">&quot;context_only&quot;</span><span class="p">:</span>
+                <span class="n">output</span><span class="o">.</span><span class="n">finish_reason</span> <span class="o">=</span> <span class="s1">&#39;not_finished&#39;</span>
             <span class="k">elif</span> <span class="n">finish_reasons</span><span class="p">[</span><span class="n">src_idx</span><span class="p">]</span> <span class="o">==</span> <span class="n">tllm</span><span class="o">.</span><span class="n">FinishReason</span><span class="o">.</span><span class="n">CANCELLED</span><span class="p">:</span>
                 <span class="k">pass</span>
             <span class="k">else</span><span class="p">:</span>
@@ -1262,6 +1270,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/executor/utils.html b/latest/_modules/tensorrt_llm/executor/utils.html
index 641f4b24d3..eeced86c7f 100644
--- a/latest/_modules/tensorrt_llm/executor/utils.html
+++ b/latest/_modules/tensorrt_llm/executor/utils.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -510,6 +514,8 @@
 <span class="kn">from</span><span class="w"> </span><span class="nn">queue</span><span class="w"> </span><span class="kn">import</span> <span class="n">Empty</span><span class="p">,</span> <span class="n">Queue</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">typing</span><span class="w"> </span><span class="kn">import</span> <span class="n">Any</span><span class="p">,</span> <span class="n">Callable</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">NamedTuple</span><span class="p">,</span> <span class="n">Optional</span>
 
+<span class="kn">from</span><span class="w"> </span><span class="nn">strenum</span><span class="w"> </span><span class="kn">import</span> <span class="n">StrEnum</span>
+
 <span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm._utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">mpi_rank</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">Response</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.llmapi.utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">print_colored_debug</span>
@@ -519,18 +525,35 @@
                                   <span class="n">RemoteMpiCommSessionClient</span><span class="p">)</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">..llmapi.utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">print_colored_debug</span>
 
+
+<span class="k">class</span><span class="w"> </span><span class="nc">LlmLauncherEnvs</span><span class="p">(</span><span class="n">StrEnum</span><span class="p">):</span>
+    <span class="c1"># Spawn a process for the LLM-API Proxy</span>
+    <span class="n">TLLM_SPAWN_PROXY_PROCESS</span> <span class="o">=</span> <span class="s2">&quot;TLLM_SPAWN_PROXY_PROCESS&quot;</span>
+    <span class="n">TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR</span> <span class="o">=</span> <span class="s2">&quot;TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR&quot;</span>
+    <span class="n">TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY</span> <span class="o">=</span> <span class="s2">&quot;TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY&quot;</span>
+
+    <span class="c1"># Whether to use periodical responses handler in await_responses</span>
+    <span class="n">TLLM_EXECUTOR_PERIODICAL_RESP_IN_AWAIT</span> <span class="o">=</span> <span class="s2">&quot;TLLM_EXECUTOR_PERIODICAL_RESP_IN_AWAIT&quot;</span>
+
+
 <span class="n">PERIODICAL_RESP_IN_AWAIT</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">getenv</span><span class="p">(</span>
-    <span class="s2">&quot;TLLM_EXECUTOR_PERIODICAL_RESP_IN_AWAIT&quot;</span><span class="p">)</span> <span class="o">==</span> <span class="s2">&quot;1&quot;</span>
+    <span class="n">LlmLauncherEnvs</span><span class="o">.</span><span class="n">TLLM_EXECUTOR_PERIODICAL_RESP_IN_AWAIT</span><span class="p">)</span> <span class="o">==</span> <span class="s2">&quot;1&quot;</span>
 
 
 <span class="k">def</span><span class="w"> </span><span class="nf">get_spawn_proxy_process_ipc_addr_env</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="nb">str</span> <span class="o">|</span> <span class="kc">None</span><span class="p">:</span>
 <span class="w">    </span><span class="sd">&#39;&#39;&#39; Get the IPC address for the spawn proxy process dynamically. &#39;&#39;&#39;</span>
-    <span class="k">return</span> <span class="n">os</span><span class="o">.</span><span class="n">getenv</span><span class="p">(</span><span class="s2">&quot;TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR&quot;</span><span class="p">)</span>
+    <span class="k">return</span> <span class="n">os</span><span class="o">.</span><span class="n">getenv</span><span class="p">(</span><span class="n">LlmLauncherEnvs</span><span class="o">.</span><span class="n">TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR</span><span class="p">)</span>
+
+
+<span class="k">def</span><span class="w"> </span><span class="nf">get_spawn_proxy_process_ipc_hmac_key_env</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="nb">bytes</span> <span class="o">|</span> <span class="kc">None</span><span class="p">:</span>
+<span class="w">    </span><span class="sd">&#39;&#39;&#39; Get the HMAC key for the spawn proxy process dynamically. &#39;&#39;&#39;</span>
+    <span class="k">if</span> <span class="n">key</span> <span class="o">:=</span> <span class="n">os</span><span class="o">.</span><span class="n">getenv</span><span class="p">(</span><span class="s2">&quot;TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY&quot;</span><span class="p">):</span>
+        <span class="k">return</span> <span class="nb">bytes</span><span class="o">.</span><span class="n">fromhex</span><span class="p">(</span><span class="n">key</span><span class="p">)</span>
 
 
 <span class="k">def</span><span class="w"> </span><span class="nf">get_spawn_proxy_process_env</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
 <span class="w">    </span><span class="sd">&#39;&#39;&#39; Get the environment variable for the spawn proxy process dynamically. &#39;&#39;&#39;</span>
-    <span class="k">return</span> <span class="n">os</span><span class="o">.</span><span class="n">getenv</span><span class="p">(</span><span class="s2">&quot;TLLM_SPAWN_PROXY_PROCESS&quot;</span><span class="p">)</span> <span class="o">==</span> <span class="s2">&quot;1&quot;</span>
+    <span class="k">return</span> <span class="n">os</span><span class="o">.</span><span class="n">getenv</span><span class="p">(</span><span class="n">LlmLauncherEnvs</span><span class="o">.</span><span class="n">TLLM_SPAWN_PROXY_PROCESS</span><span class="p">)</span> <span class="o">==</span> <span class="s2">&quot;1&quot;</span>
 
 
 <span class="k">if</span> <span class="n">PERIODICAL_RESP_IN_AWAIT</span><span class="p">:</span>
@@ -543,14 +566,11 @@
     <span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">,</span> <span class="sa">f</span><span class="s2">&quot;create_mpi_comm_session must be called by rank 0, but it was called by rank </span><span class="si">{</span><span class="n">mpi_rank</span><span class="p">()</span><span class="si">}</span><span class="s2">&quot;</span>
     <span class="k">if</span> <span class="n">get_spawn_proxy_process_env</span><span class="p">():</span>
         <span class="k">assert</span> <span class="n">get_spawn_proxy_process_ipc_addr_env</span><span class="p">(</span>
-        <span class="p">),</span> <span class="s2">&quot;TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR is not set.&quot;</span>
+        <span class="p">),</span> <span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">LlmLauncherEnvs</span><span class="o">.</span><span class="n">TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR</span><span class="si">}</span><span class="s2"> is not set.&quot;</span>
         <span class="n">print_colored_debug</span><span class="p">(</span>
             <span class="sa">f</span><span class="s2">&quot;Using RemoteMpiPoolSessionClient to bind to external MPI processes at </span><span class="si">{</span><span class="n">get_spawn_proxy_process_ipc_addr_env</span><span class="p">()</span><span class="si">}</span><span class="se">\n</span><span class="s2">&quot;</span><span class="p">,</span>
             <span class="s2">&quot;yellow&quot;</span><span class="p">)</span>
-        <span class="n">hmac_key</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">getenv</span><span class="p">(</span><span class="s2">&quot;TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY&quot;</span><span class="p">)</span>
-        <span class="c1"># Convert the hex string to bytes</span>
-        <span class="k">if</span> <span class="n">hmac_key</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
-            <span class="n">hmac_key</span> <span class="o">=</span> <span class="nb">bytes</span><span class="o">.</span><span class="n">fromhex</span><span class="p">(</span><span class="n">hmac_key</span><span class="p">)</span>
+        <span class="n">hmac_key</span> <span class="o">=</span> <span class="n">get_spawn_proxy_process_ipc_hmac_key_env</span><span class="p">()</span>
         <span class="k">return</span> <span class="n">RemoteMpiCommSessionClient</span><span class="p">(</span>
             <span class="n">addr</span><span class="o">=</span><span class="n">get_spawn_proxy_process_ipc_addr_env</span><span class="p">(),</span> <span class="n">hmac_key</span><span class="o">=</span><span class="n">hmac_key</span><span class="p">)</span>
     <span class="k">else</span><span class="p">:</span>
@@ -758,6 +778,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/functional.html b/latest/_modules/tensorrt_llm/functional.html
index 90bf679321..e9badfb41b 100644
--- a/latest/_modules/tensorrt_llm/functional.html
+++ b/latest/_modules/tensorrt_llm/functional.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -4727,7 +4731,8 @@
     <span class="n">UB</span> <span class="o">=</span> <span class="mi">2</span>
     <span class="n">AUTO</span> <span class="o">=</span> <span class="mi">3</span>
     <span class="n">ONESHOT</span> <span class="o">=</span> <span class="mi">4</span>
-    <span class="n">TWOSHOT</span> <span class="o">=</span> <span class="mi">5</span></div>
+    <span class="n">TWOSHOT</span> <span class="o">=</span> <span class="mi">5</span>
+    <span class="n">LOWPRECISION</span> <span class="o">=</span> <span class="mi">6</span></div>
 
 
 
@@ -8673,6 +8678,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/layers/activation.html b/latest/_modules/tensorrt_llm/layers/activation.html
index 5ea2653c31..6f42b49a0b 100644
--- a/latest/_modules/tensorrt_llm/layers/activation.html
+++ b/latest/_modules/tensorrt_llm/layers/activation.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -639,6 +643,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/layers/attention.html b/latest/_modules/tensorrt_llm/layers/attention.html
index a53b5ac23f..d1d370eaf2 100644
--- a/latest/_modules/tensorrt_llm/layers/attention.html
+++ b/latest/_modules/tensorrt_llm/layers/attention.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -3504,6 +3508,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/layers/cast.html b/latest/_modules/tensorrt_llm/layers/cast.html
index fd8c991724..8a50d31b0e 100644
--- a/latest/_modules/tensorrt_llm/layers/cast.html
+++ b/latest/_modules/tensorrt_llm/layers/cast.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -646,6 +650,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/layers/conv.html b/latest/_modules/tensorrt_llm/layers/conv.html
index ab22caaf95..83fc9ea691 100644
--- a/latest/_modules/tensorrt_llm/layers/conv.html
+++ b/latest/_modules/tensorrt_llm/layers/conv.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -895,6 +899,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/layers/embedding.html b/latest/_modules/tensorrt_llm/layers/embedding.html
index 33be0cfd8b..aacfd70035 100644
--- a/latest/_modules/tensorrt_llm/layers/embedding.html
+++ b/latest/_modules/tensorrt_llm/layers/embedding.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1362,6 +1366,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/layers/linear.html b/latest/_modules/tensorrt_llm/layers/linear.html
index 5cf9f0c957..f399188379 100644
--- a/latest/_modules/tensorrt_llm/layers/linear.html
+++ b/latest/_modules/tensorrt_llm/layers/linear.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1210,6 +1214,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/layers/mlp.html b/latest/_modules/tensorrt_llm/layers/mlp.html
index 37c99f6445..e5bfd99f21 100644
--- a/latest/_modules/tensorrt_llm/layers/mlp.html
+++ b/latest/_modules/tensorrt_llm/layers/mlp.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1236,6 +1240,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/layers/normalization.html b/latest/_modules/tensorrt_llm/layers/normalization.html
index 9b65eb238b..39cca5e8ac 100644
--- a/latest/_modules/tensorrt_llm/layers/normalization.html
+++ b/latest/_modules/tensorrt_llm/layers/normalization.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1000,6 +1004,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/layers/pooling.html b/latest/_modules/tensorrt_llm/layers/pooling.html
index 16ebe7e6a7..3b9b232be7 100644
--- a/latest/_modules/tensorrt_llm/layers/pooling.html
+++ b/latest/_modules/tensorrt_llm/layers/pooling.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -655,6 +659,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/llmapi/build_cache.html b/latest/_modules/tensorrt_llm/llmapi/build_cache.html
index 8ac8be5d16..211ec0ce6a 100644
--- a/latest/_modules/tensorrt_llm/llmapi/build_cache.html
+++ b/latest/_modules/tensorrt_llm/llmapi/build_cache.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -939,6 +943,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/llmapi/llm.html b/latest/_modules/tensorrt_llm/llmapi/llm.html
index 230bacb8e0..9f22875735 100644
--- a/latest/_modules/tensorrt_llm/llmapi/llm.html
+++ b/latest/_modules/tensorrt_llm/llmapi/llm.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -515,6 +519,7 @@
 <span class="kn">from</span><span class="w"> </span><span class="nn">tqdm</span><span class="w"> </span><span class="kn">import</span> <span class="n">tqdm</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">transformers</span><span class="w"> </span><span class="kn">import</span> <span class="n">PreTrainedTokenizerBase</span>
 
+<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.builder</span><span class="w"> </span><span class="kn">import</span> <span class="n">BuildConfig</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.inputs.data</span><span class="w"> </span><span class="kn">import</span> <span class="n">TextPrompt</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.inputs.registry</span><span class="w"> </span><span class="kn">import</span> <span class="n">DefaultInputProcessor</span>
 
@@ -532,8 +537,9 @@
 <span class="kn">from</span><span class="w"> </span><span class="nn">..inputs</span><span class="w"> </span><span class="kn">import</span> <span class="n">PromptInputs</span><span class="p">,</span> <span class="n">create_input_processor</span><span class="p">,</span> <span class="n">prompt_inputs</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">..logger</span><span class="w"> </span><span class="kn">import</span> <span class="n">logger</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">..sampling_params</span><span class="w"> </span><span class="kn">import</span> <span class="n">SamplingParams</span>
-<span class="kn">from</span><span class="w"> </span><span class="nn">.llm_args</span><span class="w"> </span><span class="kn">import</span> <span class="n">LLMARGS_EXPLICIT_DOCSTRING</span><span class="p">,</span> <span class="n">PybindMirror</span>
-<span class="kn">from</span><span class="w"> </span><span class="nn">.llm_utils</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">CachedModelLoader</span><span class="p">,</span> <span class="n">KvCacheRetentionConfig</span><span class="p">,</span> <span class="n">LlmArgs</span><span class="p">,</span>
+<span class="kn">from</span><span class="w"> </span><span class="nn">.llm_args</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">LLMARGS_EXPLICIT_DOCSTRING</span><span class="p">,</span> <span class="n">PybindMirror</span><span class="p">,</span> <span class="n">TorchLlmArgs</span><span class="p">,</span>
+                       <span class="n">TrtLlmArgs</span><span class="p">)</span>
+<span class="kn">from</span><span class="w"> </span><span class="nn">.llm_utils</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">CachedModelLoader</span><span class="p">,</span> <span class="n">KvCacheRetentionConfig</span><span class="p">,</span>
                         <span class="n">LlmBuildStats</span><span class="p">,</span> <span class="n">ModelLoader</span><span class="p">,</span> <span class="n">_ModelRuntimeContext</span><span class="p">)</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">.mpi_session</span><span class="w"> </span><span class="kn">import</span> <span class="n">MpiPoolSession</span><span class="p">,</span> <span class="n">external_mpi_comm_available</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">.tokenizer</span><span class="w"> </span><span class="kn">import</span> <span class="n">TokenizerBase</span><span class="p">,</span> <span class="n">_xgrammar_tokenizer_info</span>
@@ -625,9 +631,10 @@
         <span class="bp">self</span><span class="o">.</span><span class="n">_executor_cls</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s2">&quot;executor_cls&quot;</span><span class="p">,</span> <span class="n">GenerationExecutor</span><span class="p">)</span>
 
         <span class="k">try</span><span class="p">:</span>
-            <span class="bp">self</span><span class="o">.</span><span class="n">pytorch_backend_config</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="s1">&#39;pytorch_backend_config&#39;</span><span class="p">,</span>
-                                                     <span class="kc">None</span><span class="p">)</span>
-            <span class="bp">self</span><span class="o">.</span><span class="n">args</span> <span class="o">=</span> <span class="n">LlmArgs</span><span class="o">.</span><span class="n">from_kwargs</span><span class="p">(</span>
+            <span class="n">llm_args_cls</span> <span class="o">=</span> <span class="n">TorchLlmArgs</span> <span class="k">if</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span>
+                <span class="s1">&#39;backend&#39;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span> <span class="o">==</span> <span class="s1">&#39;pytorch&#39;</span> <span class="k">else</span> <span class="n">TrtLlmArgs</span>
+
+            <span class="bp">self</span><span class="o">.</span><span class="n">args</span> <span class="o">=</span> <span class="n">llm_args_cls</span><span class="o">.</span><span class="n">from_kwargs</span><span class="p">(</span>
                 <span class="n">model</span><span class="o">=</span><span class="n">model</span><span class="p">,</span>
                 <span class="n">tokenizer</span><span class="o">=</span><span class="n">tokenizer</span><span class="p">,</span>
                 <span class="n">tokenizer_mode</span><span class="o">=</span><span class="n">tokenizer_mode</span><span class="p">,</span>
@@ -675,8 +682,9 @@
             <span class="c1"># Due to the Executor can only accept a engine path, we need to save the engine to a directory</span>
             <span class="bp">self</span><span class="o">.</span><span class="n">_engine_dir</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Path</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
             <span class="bp">self</span><span class="o">.</span><span class="n">_executor</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">GenerationExecutor</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
-            <span class="bp">self</span><span class="o">.</span><span class="n">_workspace</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">TemporaryDirectory</span><span class="p">(</span>
-                <span class="n">suffix</span><span class="o">=</span><span class="s2">&quot;-llm-workspace&quot;</span><span class="p">,</span> <span class="nb">dir</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">workspace</span><span class="p">)</span>
+            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span><span class="p">:</span>
+                <span class="bp">self</span><span class="o">.</span><span class="n">_workspace</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">TemporaryDirectory</span><span class="p">(</span>
+                    <span class="n">suffix</span><span class="o">=</span><span class="s2">&quot;-llm-workspace&quot;</span><span class="p">,</span> <span class="nb">dir</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">workspace</span><span class="p">)</span>
 
             <span class="bp">self</span><span class="o">.</span><span class="n">_hf_model_dir</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Path</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
 
@@ -696,7 +704,7 @@
 
     <span class="nd">@property</span>
     <span class="k">def</span><span class="w"> </span><span class="nf">workspace</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Path</span><span class="p">:</span>
-        <span class="k">return</span> <span class="n">Path</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_workspace</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
+        <span class="k">return</span> <span class="n">Path</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_workspace</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span> <span class="k">else</span> <span class="kc">None</span>
 
 <div class="viewcode-block" id="LLM.generate">
 <a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.LLM.generate">[docs]</a>
@@ -808,10 +816,13 @@
 <span class="sd">        &quot;&quot;&quot;</span>
         <span class="n">sampling_params</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_sampling_params</span><span class="p">(</span><span class="n">sampling_params</span><span class="p">)</span>
 
-        <span class="k">if</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">n</span> <span class="o">&gt;</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_batch_size</span><span class="p">:</span>
-            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
-                <span class="sa">f</span><span class="s2">&quot;SamplingParams.n (</span><span class="si">{</span><span class="n">sampling_params</span><span class="o">.</span><span class="n">n</span><span class="si">}</span><span class="s2">) should not exceed max_batch_size (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_batch_size</span><span class="si">}</span><span class="s2">)&quot;</span>
-            <span class="p">)</span>
+        <span class="c1"># With pytorch backend, py_executor has logic to handle max_tokens of 1,</span>
+        <span class="c1"># so set to 1 to avoid allocating unnecessary KV cache blocks for single request</span>
+        <span class="c1"># TODO: Also support for trt backend</span>
+        <span class="k">if</span> <span class="p">(</span><span class="n">disaggregated_params</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
+                <span class="ow">and</span> <span class="n">disaggregated_params</span><span class="o">.</span><span class="n">request_type</span> <span class="o">==</span> <span class="s2">&quot;context_only&quot;</span>
+                <span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span><span class="p">):</span>
+            <span class="n">sampling_params</span><span class="o">.</span><span class="n">max_tokens</span> <span class="o">=</span> <span class="mi">1</span>
 
         <span class="n">inputs</span> <span class="o">=</span> <span class="n">prompt_inputs</span><span class="p">(</span><span class="n">inputs</span><span class="p">)</span>
 
@@ -839,8 +850,9 @@
             <span class="n">prompt</span> <span class="o">=</span> <span class="kc">None</span>
             <span class="n">query_token_ids</span> <span class="o">=</span> <span class="n">inputs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;query_token_ids&quot;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
         <span class="k">elif</span> <span class="s2">&quot;prompt&quot;</span> <span class="ow">in</span> <span class="n">inputs</span><span class="p">:</span>
-            <span class="n">prompt_token_ids</span><span class="p">,</span> <span class="n">extra_processed_inputs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">input_processor</span><span class="p">(</span>
-                <span class="n">inputs</span><span class="p">,</span> <span class="n">sampling_params</span><span class="p">)</span>
+            <span class="k">with</span> <span class="n">nvtx_range_debug</span><span class="p">(</span><span class="s2">&quot;input_processor&quot;</span><span class="p">):</span>
+                <span class="n">prompt_token_ids</span><span class="p">,</span> <span class="n">extra_processed_inputs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">input_processor</span><span class="p">(</span>
+                    <span class="n">inputs</span><span class="p">,</span> <span class="n">sampling_params</span><span class="p">)</span>
             <span class="n">prompt</span> <span class="o">=</span> <span class="n">inputs</span><span class="p">[</span><span class="s1">&#39;prompt&#39;</span><span class="p">]</span>
             <span class="k">if</span> <span class="n">extra_processed_inputs</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
                 <span class="n">query_token_ids</span> <span class="o">=</span> <span class="n">extra_processed_inputs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;query_token_ids&#39;</span><span class="p">)</span>
@@ -1025,10 +1037,28 @@
                 <span class="sa">f</span><span class="s2">&quot;The sum of prompt length (</span><span class="si">{</span><span class="n">prompt_len</span><span class="o">/</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">cp_size</span><span class="si">}</span><span class="s2">) and query length (</span><span class="si">{</span><span class="n">query_len</span><span class="si">}</span><span class="s2">) max_tokens (</span><span class="si">{</span><span class="n">sampling_params</span><span class="o">.</span><span class="n">max_tokens</span><span class="si">}</span><span class="s2">) should not exceed &quot;</span>
                 <span class="sa">f</span><span class="s2">&quot;max_seq_len (</span><span class="si">{</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_seq_len</span><span class="si">}</span><span class="s2">)&quot;</span><span class="p">)</span>
 
-        <span class="k">if</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">use_beam_search</span> <span class="ow">and</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">n</span> <span class="o">&gt;</span> <span class="n">build_config</span><span class="o">.</span><span class="n">max_beam_width</span><span class="p">:</span>
-            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
-                <span class="sa">f</span><span class="s2">&quot;sampling_params&#39;s n (</span><span class="si">{</span><span class="n">sampling_params</span><span class="o">.</span><span class="n">n</span><span class="si">}</span><span class="s2">) should not exceed max_beam_width (</span><span class="si">{</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_beam_width</span><span class="si">}</span><span class="s2">) when use_beam_search is True&quot;</span>
-            <span class="p">)</span>
+        <span class="k">if</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">use_beam_search</span> <span class="ow">and</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">best_of</span> <span class="o">&gt;</span> <span class="n">build_config</span><span class="o">.</span><span class="n">max_beam_width</span><span class="p">:</span>
+            <span class="k">if</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">n</span> <span class="o">==</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">best_of</span><span class="p">:</span>
+                <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
+                    <span class="sa">f</span><span class="s2">&quot;sampling_params.n (</span><span class="si">{</span><span class="n">sampling_params</span><span class="o">.</span><span class="n">n</span><span class="si">}</span><span class="s2">) cannot exceed max_beam_width (</span><span class="si">{</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_beam_width</span><span class="si">}</span><span class="s2">) when use_beam_search is True&quot;</span>
+                <span class="p">)</span>
+            <span class="k">else</span><span class="p">:</span>
+                <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
+                    <span class="sa">f</span><span class="s2">&quot;sampling_params.best_of (</span><span class="si">{</span><span class="n">sampling_params</span><span class="o">.</span><span class="n">best_of</span><span class="si">}</span><span class="s2">) cannot exceed max_beam_width (</span><span class="si">{</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_beam_width</span><span class="si">}</span><span class="s2">) when use_beam_search is True&quot;</span>
+                <span class="p">)</span>
+
+        <span class="n">max_batch_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_batch_size</span>
+        <span class="k">if</span> <span class="n">max_batch_size</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
+            <span class="n">max_batch_size</span> <span class="o">=</span> <span class="n">build_config</span><span class="o">.</span><span class="n">max_batch_size</span>
+        <span class="k">if</span> <span class="ow">not</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">use_beam_search</span> <span class="ow">and</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">best_of</span> <span class="o">&gt;</span> <span class="n">max_batch_size</span><span class="p">:</span>
+            <span class="k">if</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">n</span> <span class="o">==</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">best_of</span><span class="p">:</span>
+                <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
+                    <span class="sa">f</span><span class="s2">&quot;sampling_params.n (</span><span class="si">{</span><span class="n">sampling_params</span><span class="o">.</span><span class="n">n</span><span class="si">}</span><span class="s2">) cannot exceed max_batch_size (</span><span class="si">{</span><span class="n">max_batch_size</span><span class="si">}</span><span class="s2">) when use_beam_search is False&quot;</span>
+                <span class="p">)</span>
+            <span class="k">else</span><span class="p">:</span>
+                <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
+                    <span class="sa">f</span><span class="s2">&quot;sampling_params.best_of (</span><span class="si">{</span><span class="n">sampling_params</span><span class="o">.</span><span class="n">best_of</span><span class="si">}</span><span class="s2">) cannot exceed max_batch_size (</span><span class="si">{</span><span class="n">max_batch_size</span><span class="si">}</span><span class="s2">) when use_beam_search is False&quot;</span>
+                <span class="p">)</span>
 
         <span class="k">if</span> <span class="n">sampling_params</span><span class="o">.</span><span class="n">prompt_logprobs</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">build_config</span><span class="o">.</span><span class="n">gather_context_logits</span><span class="p">:</span>
             <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
@@ -1064,11 +1094,19 @@
                                                       <span class="bp">self</span><span class="o">.</span><span class="n">tokenizer</span><span class="p">)</span>
         <span class="bp">self</span><span class="o">.</span><span class="n">tokenizer</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">input_processor</span><span class="o">.</span><span class="n">tokenizer</span>
 
-        <span class="n">max_batch_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_batch_size</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_batch_size</span>
-        <span class="n">max_num_tokens</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_num_tokens</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_num_tokens</span>
-        <span class="n">max_seq_len</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_seq_len</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_seq_len</span>
+        <span class="n">max_batch_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_batch_size</span>
+        <span class="n">max_num_tokens</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_num_tokens</span>
+        <span class="n">max_seq_len</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_seq_len</span>
+
+        <span class="n">build_config</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span> <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span> <span class="k">else</span> <span class="n">BuildConfig</span><span class="p">(</span>
+        <span class="p">)</span>
+
+        <span class="n">max_batch_size</span> <span class="o">=</span> <span class="n">max_batch_size</span> <span class="ow">or</span> <span class="n">build_config</span><span class="o">.</span><span class="n">max_batch_size</span>
+        <span class="n">max_num_tokens</span> <span class="o">=</span> <span class="n">max_num_tokens</span> <span class="ow">or</span> <span class="n">build_config</span><span class="o">.</span><span class="n">max_num_tokens</span>
+        <span class="n">max_seq_len</span> <span class="o">=</span> <span class="n">max_seq_len</span> <span class="ow">or</span> <span class="n">build_config</span><span class="o">.</span><span class="n">max_seq_len</span>
+
         <span class="n">executor_config</span> <span class="o">=</span> <span class="n">tllm</span><span class="o">.</span><span class="n">ExecutorConfig</span><span class="p">(</span>
-            <span class="n">max_beam_width</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_beam_width</span><span class="p">,</span>
+            <span class="n">max_beam_width</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_beam_width</span><span class="p">,</span>
             <span class="n">scheduler_config</span><span class="o">=</span><span class="n">PybindMirror</span><span class="o">.</span><span class="n">maybe_to_pybind</span><span class="p">(</span>
                 <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">scheduler_config</span><span class="p">),</span>
             <span class="n">batching_type</span><span class="o">=</span><span class="n">PybindMirror</span><span class="o">.</span><span class="n">maybe_to_pybind</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">batching_type</span><span class="p">)</span>
@@ -1094,7 +1132,7 @@
         <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">peft_cache_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
             <span class="n">executor_config</span><span class="o">.</span><span class="n">peft_cache_config</span> <span class="o">=</span> <span class="n">PybindMirror</span><span class="o">.</span><span class="n">maybe_to_pybind</span><span class="p">(</span>
                 <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">peft_cache_config</span><span class="p">)</span>
-        <span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">plugin_config</span><span class="o">.</span><span class="n">lora_plugin</span><span class="p">:</span>
+        <span class="k">elif</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">plugin_config</span><span class="o">.</span><span class="n">lora_plugin</span><span class="p">:</span>
             <span class="n">engine_config</span> <span class="o">=</span> <span class="n">EngineConfig</span><span class="o">.</span><span class="n">from_json_file</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_engine_dir</span> <span class="o">/</span>
                                                         <span class="s2">&quot;config.json&quot;</span><span class="p">)</span>
             <span class="n">lora_config</span> <span class="o">=</span> <span class="n">engine_config</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">lora_config</span>
@@ -1122,7 +1160,7 @@
         <span class="n">executor_config</span><span class="o">.</span><span class="n">normalize_log_probs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">normalize_log_probs</span>
         <span class="n">executor_config</span><span class="o">.</span><span class="n">enable_chunked_context</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">enable_chunked_prefill</span>
         <span class="n">executor_config</span><span class="o">.</span><span class="n">max_beam_width</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">max_beam_width</span> <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_beam_width</span>
-        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">extended_runtime_perf_knob_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
+        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">extended_runtime_perf_knob_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
             <span class="n">executor_config</span><span class="o">.</span><span class="n">extended_runtime_perf_knob_config</span> <span class="o">=</span> <span class="n">PybindMirror</span><span class="o">.</span><span class="n">maybe_to_pybind</span><span class="p">(</span>
                 <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">extended_runtime_perf_knob_config</span><span class="p">)</span>
         <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">cache_transceiver_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
@@ -1132,9 +1170,11 @@
         <span class="n">update_executor_config</span><span class="p">(</span>
             <span class="n">executor_config</span><span class="p">,</span>
             <span class="n">backend</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">backend</span><span class="p">,</span>
-            <span class="n">pytorch_backend_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">pytorch_backend_config</span><span class="p">,</span>
+            <span class="n">pytorch_backend_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">get_pytorch_backend_config</span><span class="p">()</span>
+            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">backend</span> <span class="o">==</span> <span class="s2">&quot;pytorch&quot;</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
             <span class="n">mapping</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">to_mapping</span><span class="p">(),</span>
-            <span class="n">build_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="p">,</span>
+            <span class="n">build_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span>
+            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
             <span class="n">speculative_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">speculative_config</span><span class="p">,</span>
             <span class="n">hf_model_dir</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_hf_model_dir</span><span class="p">,</span>
             <span class="n">trt_engine_dir</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_engine_dir</span><span class="p">,</span>
@@ -1142,8 +1182,9 @@
             <span class="n">max_seq_len</span><span class="o">=</span><span class="n">max_seq_len</span><span class="p">)</span>
         <span class="n">executor_config</span><span class="o">.</span><span class="n">llm_parallel_config</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">parallel_config</span>
         <span class="n">return_logits</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">gather_generation_logits</span> <span class="ow">or</span> <span class="p">(</span>
-            <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span>
+            <span class="bp">self</span><span class="o">.</span><span class="n">_on_trt_backend</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span>
             <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">gather_context_logits</span><span class="p">)</span>
+
         <span class="bp">self</span><span class="o">.</span><span class="n">_executor</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_executor_cls</span><span class="o">.</span><span class="n">create</span><span class="p">(</span>
             <span class="bp">self</span><span class="o">.</span><span class="n">_engine_dir</span><span class="p">,</span>
             <span class="n">executor_config</span><span class="o">=</span><span class="n">executor_config</span><span class="p">,</span>
@@ -1160,6 +1201,10 @@
             <span class="n">is_llm_executor</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
             <span class="n">lora_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">lora_config</span><span class="p">)</span>
 
+    <span class="nd">@property</span>
+    <span class="k">def</span><span class="w"> </span><span class="nf">_on_trt_backend</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
+        <span class="k">return</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="p">,</span> <span class="n">TrtLlmArgs</span><span class="p">)</span>
+
     <span class="k">def</span><span class="w"> </span><span class="nf">_try_load_tokenizer</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Optional</span><span class="p">[</span><span class="n">TokenizerBase</span><span class="p">]:</span>
         <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">args</span><span class="o">.</span><span class="n">skip_tokenizer_init</span><span class="p">:</span>
             <span class="k">return</span> <span class="kc">None</span>
@@ -1379,6 +1424,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/llmapi/llm_args.html b/latest/_modules/tensorrt_llm/llmapi/llm_args.html
index 5109ab9e3e..3e41a4e02b 100644
--- a/latest/_modules/tensorrt_llm/llmapi/llm_args.html
+++ b/latest/_modules/tensorrt_llm/llmapi/llm_args.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -505,15 +509,18 @@
   <h1>Source code for tensorrt_llm.llmapi.llm_args</h1><div class="highlight"><pre>
 <span></span><span class="kn">import</span><span class="w"> </span><span class="nn">json</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">math</span>
+<span class="kn">import</span><span class="w"> </span><span class="nn">os</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">abc</span><span class="w"> </span><span class="kn">import</span> <span class="n">ABC</span><span class="p">,</span> <span class="n">abstractmethod</span>
-<span class="kn">from</span><span class="w"> </span><span class="nn">dataclasses</span><span class="w"> </span><span class="kn">import</span> <span class="n">dataclass</span><span class="p">,</span> <span class="n">field</span><span class="p">,</span> <span class="n">fields</span>
+<span class="kn">from</span><span class="w"> </span><span class="nn">dataclasses</span><span class="w"> </span><span class="kn">import</span> <span class="n">dataclass</span><span class="p">,</span> <span class="n">field</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">enum</span><span class="w"> </span><span class="kn">import</span> <span class="n">Enum</span><span class="p">,</span> <span class="n">EnumMeta</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">pathlib</span><span class="w"> </span><span class="kn">import</span> <span class="n">Path</span>
-<span class="kn">from</span><span class="w"> </span><span class="nn">typing</span><span class="w"> </span><span class="kn">import</span> <span class="n">Any</span><span class="p">,</span> <span class="n">ClassVar</span><span class="p">,</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Literal</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Union</span>
+<span class="kn">from</span><span class="w"> </span><span class="nn">typing</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">TYPE_CHECKING</span><span class="p">,</span> <span class="n">Any</span><span class="p">,</span> <span class="n">ClassVar</span><span class="p">,</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Literal</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span>
+                    <span class="n">Union</span><span class="p">)</span>
 
 <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
 <span class="kn">import</span><span class="w"> </span><span class="nn">yaml</span>
-<span class="kn">from</span><span class="w"> </span><span class="nn">pydantic</span><span class="w"> </span><span class="kn">import</span> <span class="n">BaseModel</span><span class="p">,</span> <span class="n">Field</span><span class="p">,</span> <span class="n">validator</span>
+<span class="kn">from</span><span class="w"> </span><span class="nn">pydantic</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">BaseModel</span><span class="p">,</span> <span class="n">Field</span><span class="p">,</span> <span class="n">PrivateAttr</span><span class="p">,</span> <span class="n">field_validator</span><span class="p">,</span>
+                      <span class="n">model_validator</span><span class="p">)</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">strenum</span><span class="w"> </span><span class="kn">import</span> <span class="n">StrEnum</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">transformers</span><span class="w"> </span><span class="kn">import</span> <span class="n">PreTrainedTokenizerBase</span>
 
@@ -522,23 +529,30 @@
 
 <span class="kn">from</span><span class="w"> </span><span class="nn">.._utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">mpi_rank</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">..auto_parallel</span><span class="w"> </span><span class="kn">import</span> <span class="n">AutoParallelConfig</span><span class="p">,</span> <span class="n">infer_cluster_config</span>
+
+<span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span>
+    <span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm._torch.pyexecutor.config</span><span class="w"> </span><span class="kn">import</span> <span class="n">PyTorchConfig</span>
+
 <span class="c1"># yapf: disable</span>
-<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">BatchingType</span> <span class="k">as</span> <span class="n">_BatchingType</span>
-<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> \
-    <span class="n">CacheTransceiverConfig</span> <span class="k">as</span> <span class="n">_CacheTransceiverConfig</span>
-<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> \
-    <span class="n">CapacitySchedulerPolicy</span> <span class="k">as</span> <span class="n">_CapacitySchedulerPolicy</span>
-<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">ContextChunkingPolicy</span> <span class="k">as</span> <span class="n">_ContextChunkingPolicy</span>
-<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">DecodingConfig</span><span class="p">,</span> <span class="n">DecodingMode</span>
-<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">DynamicBatchConfig</span> <span class="k">as</span> <span class="n">_DynamicBatchConfig</span>
-<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">EagleConfig</span><span class="p">,</span> <span class="n">ExecutorConfig</span>
-<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> \
-    <span class="n">ExtendedRuntimePerfKnobConfig</span> <span class="k">as</span> <span class="n">_ExtendedRuntimePerfKnobConfig</span>
-<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">KvCacheConfig</span> <span class="k">as</span> <span class="n">_KvCacheConfig</span>
-<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> \
-    <span class="n">LookaheadDecodingConfig</span> <span class="k">as</span> <span class="n">_LookaheadDecodingConfig</span>
-<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">PeftCacheConfig</span> <span class="k">as</span> <span class="n">_PeftCacheConfig</span>
-<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="n">SchedulerConfig</span> <span class="k">as</span> <span class="n">_SchedulerConfig</span>
+<span class="c1"># isort: off</span>
+<span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.executor</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span>
+                                 <span class="n">BatchingType</span> <span class="k">as</span> <span class="n">_BatchingType</span><span class="p">,</span>
+                                 <span class="n">CacheTransceiverConfig</span> <span class="k">as</span> <span class="n">_CacheTransceiverConfig</span><span class="p">,</span>
+                                 <span class="n">CapacitySchedulerPolicy</span> <span class="k">as</span> <span class="n">_CapacitySchedulerPolicy</span><span class="p">,</span>
+                                 <span class="n">ContextChunkingPolicy</span> <span class="k">as</span> <span class="n">_ContextChunkingPolicy</span><span class="p">,</span>
+                                 <span class="n">DecodingConfig</span><span class="p">,</span>
+                                 <span class="n">DecodingMode</span><span class="p">,</span>
+                                 <span class="n">DynamicBatchConfig</span> <span class="k">as</span> <span class="n">_DynamicBatchConfig</span><span class="p">,</span>
+                                 <span class="n">EagleConfig</span> <span class="k">as</span> <span class="n">_EagleConfig</span><span class="p">,</span>
+                                 <span class="n">ExecutorConfig</span> <span class="k">as</span> <span class="n">_ExecutorConfig</span><span class="p">,</span>
+                                 <span class="n">ExtendedRuntimePerfKnobConfig</span> <span class="k">as</span> <span class="n">_ExtendedRuntimePerfKnobConfig</span><span class="p">,</span>
+                                 <span class="n">KvCacheConfig</span> <span class="k">as</span> <span class="n">_KvCacheConfig</span><span class="p">,</span>
+                                 <span class="n">LookaheadDecodingConfig</span> <span class="k">as</span> <span class="n">_LookaheadDecodingConfig</span><span class="p">,</span>
+                                 <span class="n">PeftCacheConfig</span> <span class="k">as</span> <span class="n">_PeftCacheConfig</span><span class="p">,</span>
+                                 <span class="n">SchedulerConfig</span> <span class="k">as</span> <span class="n">_SchedulerConfig</span><span class="p">)</span> <span class="c1"># isort: skip</span>
+<span class="c1"># isort: on</span>
+<span class="kn">from</span><span class="w"> </span><span class="nn">transformers</span><span class="w"> </span><span class="kn">import</span> <span class="n">PreTrainedTokenizerBase</span>
+
 <span class="c1"># yapf: enable</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">..builder</span><span class="w"> </span><span class="kn">import</span> <span class="n">BuildConfig</span><span class="p">,</span> <span class="n">EngineConfig</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">..logger</span><span class="w"> </span><span class="kn">import</span> <span class="n">logger</span>
@@ -709,7 +723,8 @@
             <span class="s2">&quot;MTP&quot;</span><span class="p">:</span> <span class="n">MTPDecodingConfig</span><span class="p">,</span>
             <span class="s2">&quot;Medusa&quot;</span><span class="p">:</span> <span class="n">MedusaDecodingConfig</span><span class="p">,</span>
             <span class="s2">&quot;Eagle&quot;</span><span class="p">:</span> <span class="n">EagleDecodingConfig</span><span class="p">,</span>
-            <span class="s2">&quot;Lookahead&quot;</span><span class="p">:</span> <span class="n">LookaheadDecodingConfig</span>
+            <span class="s2">&quot;Lookahead&quot;</span><span class="p">:</span> <span class="n">LookaheadDecodingConfig</span><span class="p">,</span>
+            <span class="s2">&quot;NGram&quot;</span><span class="p">:</span> <span class="n">NGramDecodingConfig</span><span class="p">,</span>
         <span class="p">}</span>
 
         <span class="n">config_class</span> <span class="o">=</span> <span class="n">config_classes</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">decoding_type</span><span class="p">)</span>
@@ -750,6 +765,7 @@
     <span class="n">num_eagle_layers</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
     <span class="n">max_non_leaves_per_layer</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
     <span class="n">pytorch_eagle_weights_path</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
+    <span class="n">eagle3_one_model</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">True</span>
 
 <div class="viewcode-block" id="EagleDecodingConfig.from_dict">
 <a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.EagleDecodingConfig.from_dict">[docs]</a>
@@ -762,6 +778,46 @@
 
 
 
+<div class="viewcode-block" id="NGramDecodingConfig">
+<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.NGramDecodingConfig">[docs]</a>
+<span class="k">class</span><span class="w"> </span><span class="nc">NGramDecodingConfig</span><span class="p">(</span><span class="n">DecodingBaseConfig</span><span class="p">):</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
+<span class="sd">    Configuration for NGram drafter speculative decoding.</span>
+
+<span class="sd">    Arguments:</span>
+<span class="sd">        prompt_lookup_num_tokens: int</span>
+<span class="sd">            The maximum length of the draft tokens (can be understood as the maximum length of the output draft tokens).</span>
+
+<span class="sd">        max_matching_ngram_size: int</span>
+<span class="sd">            The maximum length of the tokens to search (can be understood as the maximum length of the input tokens to search).</span>
+
+<span class="sd">        is_keep_all: bool = True</span>
+<span class="sd">            Whether to keep all candidate pattern-match pairs; if False, only one match is kept for each pattern.</span>
+
+<span class="sd">        is_use_oldest: bool = True</span>
+<span class="sd">            Whether to provide the oldest match when a pattern is hit; if False, the newest one is provided.</span>
+
+<span class="sd">        is_public_pool: bool = True</span>
+<span class="sd">            Whether to use a common pool for all requests; if False, each request has its own private pool.</span>
+<span class="sd">    &quot;&quot;&quot;</span>
+
+    <span class="n">prompt_lookup_num_tokens</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">2</span>
+    <span class="n">max_matching_ngram_size</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">4</span>
+    <span class="n">is_keep_all</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>
+    <span class="n">is_use_oldest</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>
+    <span class="n">is_public_pool</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>
+
+<div class="viewcode-block" id="NGramDecodingConfig.from_dict">
+<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.NGramDecodingConfig.from_dict">[docs]</a>
+    <span class="nd">@classmethod</span>
+    <span class="k">def</span><span class="w"> </span><span class="nf">from_dict</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">data</span><span class="p">:</span> <span class="nb">dict</span><span class="p">):</span>
+        <span class="k">return</span> <span class="bp">cls</span><span class="p">(</span><span class="o">**</span><span class="n">data</span><span class="p">)</span></div>
+
+
+    <span class="n">decoding_type</span><span class="p">:</span> <span class="n">ClassVar</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;NGram&quot;</span></div>
+
+
+
 <div class="viewcode-block" id="MTPDecodingConfig">
 <a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.MTPDecodingConfig">[docs]</a>
 <span class="k">class</span><span class="w"> </span><span class="nc">MTPDecodingConfig</span><span class="p">(</span><span class="n">DecodingBaseConfig</span><span class="p">):</span>
@@ -1063,7 +1119,9 @@
 
 <div class="viewcode-block" id="LookaheadDecodingConfig.validate_positive_values">
 <a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.LookaheadDecodingConfig.validate_positive_values">[docs]</a>
-    <span class="nd">@validator</span><span class="p">(</span><span class="s1">&#39;max_window_size&#39;</span><span class="p">,</span> <span class="s1">&#39;max_ngram_size&#39;</span><span class="p">,</span> <span class="s1">&#39;max_verification_set_size&#39;</span><span class="p">)</span>
+    <span class="nd">@field_validator</span><span class="p">(</span><span class="s1">&#39;max_window_size&#39;</span><span class="p">,</span> <span class="s1">&#39;max_ngram_size&#39;</span><span class="p">,</span>
+                     <span class="s1">&#39;max_verification_set_size&#39;</span><span class="p">)</span>
+    <span class="nd">@classmethod</span>
     <span class="k">def</span><span class="w"> </span><span class="nf">validate_positive_values</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">v</span><span class="p">):</span>
         <span class="k">if</span> <span class="n">v</span> <span class="o">&lt;=</span> <span class="mi">0</span><span class="p">:</span>
             <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Value must be positive, got </span><span class="si">{</span><span class="n">v</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
@@ -1270,7 +1328,10 @@
         <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">model</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">model</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="kc">None</span>
 
 
-<span class="k">class</span><span class="w"> </span><span class="nc">LlmArgs</span><span class="p">(</span><span class="n">BaseModel</span><span class="p">):</span>
+<span class="k">class</span><span class="w"> </span><span class="nc">BaseLlmArgs</span><span class="p">(</span><span class="n">BaseModel</span><span class="p">):</span>
+<span class="w">    </span><span class="sd">&quot;&quot;&quot;</span>
+<span class="sd">    Base class for both TorchLlmArgs and TrtLlmArgs. It contains all the arguments that are common to both.</span>
+<span class="sd">    &quot;&quot;&quot;</span>
     <span class="n">model_config</span> <span class="o">=</span> <span class="p">{</span>
         <span class="s2">&quot;arbitrary_types_allowed&quot;</span><span class="p">:</span> <span class="kc">True</span><span class="p">,</span>
         <span class="s2">&quot;extra&quot;</span><span class="p">:</span> <span class="s2">&quot;allow&quot;</span><span class="p">,</span>
@@ -1342,20 +1403,11 @@
     <span class="n">cp_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">dict</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default_factory</span><span class="o">=</span><span class="nb">dict</span><span class="p">,</span>
                                       <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Context parallel config.&quot;</span><span class="p">)</span>
 
-    <span class="n">auto_parallel</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
-                                <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable auto parallel mode.&quot;</span><span class="p">)</span>
-
-    <span class="n">auto_parallel_world_size</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
-        <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;The world size for auto parallel mode.&quot;</span><span class="p">)</span>
-
     <span class="n">load_format</span><span class="p">:</span> <span class="n">Literal</span><span class="p">[</span><span class="s1">&#39;auto&#39;</span><span class="p">,</span> <span class="s1">&#39;dummy&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
         <span class="n">default</span><span class="o">=</span><span class="s1">&#39;auto&#39;</span><span class="p">,</span>
         <span class="n">description</span><span class="o">=</span><span class="s2">&quot;The format to load the model.&quot;</span><span class="p">,</span>
         <span class="n">json_schema_extra</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;type&quot;</span><span class="p">:</span> <span class="s2">&quot;Literal[&#39;auto&#39;, &#39;dummy&#39;]&quot;</span><span class="p">})</span>
 
-    <span class="n">enable_tqdm</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
-                              <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable tqdm for progress bar.&quot;</span><span class="p">)</span>
-
     <span class="c1"># LoRA arguments</span>
     <span class="n">enable_lora</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable LoRA.&quot;</span><span class="p">)</span>
 
@@ -1387,18 +1439,9 @@
     <span class="n">quant_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">QuantConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
         <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Quantization config.&quot;</span><span class="p">)</span>
 
-    <span class="n">calib_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">CalibConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
-        <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Calibration config.&quot;</span><span class="p">)</span>
-
-    <span class="c1"># BuildConfig is introduced to give users a familiar interface to configure the model building.</span>
-    <span class="n">build_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">object</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
-        <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
-        <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Build config.&quot;</span><span class="p">,</span>
-        <span class="n">json_schema_extra</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;type&quot;</span><span class="p">:</span> <span class="sa">f</span><span class="s2">&quot;Optional[</span><span class="si">{</span><span class="n">get_type_repr</span><span class="p">(</span><span class="n">BuildConfig</span><span class="p">)</span><span class="si">}</span><span class="s2">]&quot;</span><span class="p">})</span>
-
     <span class="c1"># Several options from ExecutorConfig, expanded here for less hierarchy</span>
-    <span class="n">kv_cache_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">KvCacheConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
-        <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;KV cache config.&quot;</span><span class="p">)</span>
+    <span class="n">kv_cache_config</span><span class="p">:</span> <span class="n">KvCacheConfig</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default_factory</span><span class="o">=</span><span class="n">KvCacheConfig</span><span class="p">,</span>
+                                           <span class="n">description</span><span class="o">=</span><span class="s2">&quot;KV cache config.&quot;</span><span class="p">)</span>
 
     <span class="n">enable_chunked_prefill</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
                                          <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable chunked prefill.&quot;</span><span class="p">)</span>
@@ -1421,29 +1464,12 @@
         <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
         <span class="n">description</span><span class="o">=</span><span class="s2">&quot;The maximum number of iterations for request stats.&quot;</span><span class="p">)</span>
 
-    <span class="n">workspace</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
-                                     <span class="n">description</span><span class="o">=</span><span class="s2">&quot;The workspace for the model.&quot;</span><span class="p">)</span>
-
     <span class="c1"># A handful of options from PretrainedConfig</span>
-    <span class="n">embedding_parallel_mode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
-        <span class="n">default</span><span class="o">=</span><span class="s1">&#39;SHARDING_ALONG_VOCAB&#39;</span><span class="p">,</span>
-        <span class="n">description</span><span class="o">=</span><span class="s2">&quot;The embedding parallel mode.&quot;</span><span class="p">)</span>
-
-    <span class="n">fast_build</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable fast build.&quot;</span><span class="p">)</span>
-
-    <span class="c1"># Once set, the model will reuse the build_cache</span>
-    <span class="n">enable_build_cache</span><span class="p">:</span> <span class="nb">object</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
-        <span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
-        <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable build cache.&quot;</span><span class="p">,</span>
-        <span class="n">json_schema_extra</span><span class="o">=</span><span class="p">{</span>
-            <span class="s2">&quot;type&quot;</span><span class="p">:</span> <span class="sa">f</span><span class="s2">&quot;Union[</span><span class="si">{</span><span class="n">get_type_repr</span><span class="p">(</span><span class="n">BuildCacheConfig</span><span class="p">)</span><span class="si">}</span><span class="s2">, bool]&quot;</span>
-        <span class="p">})</span>
-
     <span class="n">peft_cache_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">PeftCacheConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
         <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;PEFT cache config.&quot;</span><span class="p">)</span>
 
-    <span class="n">scheduler_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">SchedulerConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
-        <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Scheduler config.&quot;</span><span class="p">)</span>
+    <span class="n">scheduler_config</span><span class="p">:</span> <span class="n">SchedulerConfig</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default_factory</span><span class="o">=</span><span class="n">SchedulerConfig</span><span class="p">,</span>
+                                              <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Scheduler config.&quot;</span><span class="p">)</span>
 
     <span class="n">cache_transceiver_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">CacheTransceiverConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
         <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Cache transceiver config.&quot;</span><span class="p">)</span>
@@ -1451,8 +1477,8 @@
     <span class="c1"># Speculative decoding parameters</span>
     <span class="n">speculative_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span>
         <span class="n">LookaheadDecodingConfig</span><span class="p">,</span> <span class="n">MedusaDecodingConfig</span><span class="p">,</span> <span class="n">EagleDecodingConfig</span><span class="p">,</span>
-        <span class="n">MTPDecodingConfig</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
-                                    <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Speculative decoding config.&quot;</span><span class="p">)</span>
+        <span class="n">MTPDecodingConfig</span><span class="p">,</span> <span class="n">NGramDecodingConfig</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+            <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Speculative decoding config.&quot;</span><span class="p">)</span>
 
     <span class="n">batching_type</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">BatchingType</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
                                                   <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Batching type.&quot;</span><span class="p">)</span>
@@ -1460,13 +1486,6 @@
     <span class="n">normalize_log_probs</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
         <span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Normalize log probabilities.&quot;</span><span class="p">)</span>
 
-    <span class="n">gather_generation_logits</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
-        <span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Gather generation logits.&quot;</span><span class="p">)</span>
-
-    <span class="n">extended_runtime_perf_knob_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span>
-        <span class="n">ExtendedRuntimePerfKnobConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
-            <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Extended runtime perf knob config.&quot;</span><span class="p">)</span>
-
     <span class="n">max_batch_size</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
                                           <span class="n">description</span><span class="o">=</span><span class="s2">&quot;The maximum batch size.&quot;</span><span class="p">)</span>
 
@@ -1487,6 +1506,9 @@
                                    <span class="n">description</span><span class="o">=</span><span class="s2">&quot;The backend to use.&quot;</span><span class="p">,</span>
                                    <span class="n">exclude</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
 
+    <span class="n">gather_generation_logits</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Gather generation logits.&quot;</span><span class="p">)</span>
+
     <span class="c1"># private fields those are unstable and just for internal use</span>
     <span class="n">num_postprocess_workers</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
         <span class="n">default</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span>
@@ -1559,40 +1581,19 @@
             <span class="n">moe_tp_size</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_tensor_parallel_size</span><span class="p">,</span>
             <span class="n">moe_ep_size</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_expert_parallel_size</span><span class="p">,</span>
             <span class="n">enable_attention_dp</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_attention_dp</span><span class="p">,</span>
-            <span class="n">cp_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">cp_config</span><span class="p">,</span>
-            <span class="n">auto_parallel</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">auto_parallel</span><span class="p">)</span>
-        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">auto_parallel</span><span class="p">:</span>
-            <span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">world_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">auto_parallel_world_size</span>
-
-        <span class="bp">self</span><span class="o">.</span><span class="n">auto_parallel_config</span> <span class="o">=</span> <span class="n">AutoParallelConfig</span><span class="p">(</span>
-            <span class="n">sharded_io_allowlist</span><span class="o">=</span><span class="p">[</span>
-                <span class="s2">&quot;past_key_value_</span><span class="se">\\</span><span class="s2">d+&quot;</span><span class="p">,</span>
-                <span class="s2">&quot;present_key_value_</span><span class="se">\\</span><span class="s2">d*&quot;</span><span class="p">,</span>
-            <span class="p">],</span>
-            <span class="n">same_buffer_io</span><span class="o">=</span><span class="p">{</span>
-                <span class="s2">&quot;past_key_value_(</span><span class="se">\\</span><span class="s2">d+)&quot;</span><span class="p">:</span> <span class="s2">&quot;present_key_value_</span><span class="se">\\</span><span class="s2">1&quot;</span><span class="p">,</span>
-            <span class="p">},</span>
-            <span class="o">**</span><span class="n">infer_cluster_config</span><span class="p">(),</span>
-        <span class="p">)</span>
-
-        <span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span> <span class="ow">or</span> <span class="n">KvCacheConfig</span><span class="p">()</span>
-
-        <span class="bp">self</span><span class="o">.</span><span class="n">scheduler_config</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">scheduler_config</span> <span class="ow">or</span> <span class="n">SchedulerConfig</span><span class="p">()</span>
-
-        <span class="c1"># This is used to hold th options for convert_checkpoint</span>
-        <span class="bp">self</span><span class="o">.</span><span class="n">_convert_checkpoint_options</span> <span class="o">=</span> <span class="p">{}</span>
+            <span class="n">cp_config</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">cp_config</span><span class="p">)</span>
 
     <span class="nd">@classmethod</span>
-    <span class="k">def</span><span class="w"> </span><span class="nf">from_kwargs</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;LlmArgs&quot;</span><span class="p">:</span>
+    <span class="k">def</span><span class="w"> </span><span class="nf">from_kwargs</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;BaseLlmArgs&quot;</span><span class="p">:</span>
 <span class="w">        </span><span class="sd">&quot;&quot;&quot;Create `LlmArgs` instance from kwargs.</span>
 
 <span class="sd">        Args:</span>
 <span class="sd">            kwargs (Any): Arguments passed to `LlmArgs` constructor.</span>
 
 <span class="sd">        Returns:</span>
-<span class="sd">            tensorrt_llm.llmapi.llm_utils.LlmArgs: The `LlmArgs` instance.</span>
+<span class="sd">            tensorrt_llm.llmapi.llm_utils.BaseLlmArgs: The `BaseLlmArgs` instance.</span>
 <span class="sd">        &quot;&quot;&quot;</span>
-        <span class="n">kwargs</span> <span class="o">=</span> <span class="n">LlmArgs</span><span class="o">.</span><span class="n">_maybe_update_config_for_consistency</span><span class="p">(</span><span class="nb">dict</span><span class="p">(</span><span class="n">kwargs</span><span class="p">))</span>
+        <span class="n">kwargs</span> <span class="o">=</span> <span class="n">BaseLlmArgs</span><span class="o">.</span><span class="n">_maybe_update_config_for_consistency</span><span class="p">(</span><span class="nb">dict</span><span class="p">(</span><span class="n">kwargs</span><span class="p">))</span>
         <span class="n">ret</span> <span class="o">=</span> <span class="bp">cls</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
         <span class="n">ret</span><span class="o">.</span><span class="n">_setup</span><span class="p">()</span>
         <span class="k">return</span> <span class="n">ret</span>
@@ -1603,8 +1604,7 @@
 <span class="sd">        Returns:</span>
 <span class="sd">            dict: The dict that contains all fields of the `LlmArgs` instance.</span>
 <span class="sd">        &quot;&quot;&quot;</span>
-        <span class="k">return</span> <span class="nb">dict</span><span class="p">(</span>
-            <span class="p">(</span><span class="n">field</span><span class="o">.</span><span class="n">name</span><span class="p">,</span> <span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">field</span><span class="o">.</span><span class="n">name</span><span class="p">))</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">fields</span><span class="p">(</span><span class="bp">self</span><span class="p">))</span>
+        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">model_dump</span><span class="p">()</span>
 
     <span class="nd">@staticmethod</span>
     <span class="k">def</span><span class="w"> </span><span class="nf">_maybe_update_config_for_consistency</span><span class="p">(</span>
@@ -1612,18 +1612,18 @@
         <span class="c1"># max_beam_width is not included since vague behavior due to lacking the support for dynamic beam width during</span>
         <span class="c1"># generation</span>
         <span class="n">black_list</span> <span class="o">=</span> <span class="nb">set</span><span class="p">([</span><span class="s2">&quot;max_beam_width&quot;</span><span class="p">])</span>
-        <span class="n">executor_config_attrs</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">attr</span> <span class="k">for</span> <span class="n">attr</span> <span class="ow">in</span> <span class="nb">dir</span><span class="p">(</span><span class="n">ExecutorConfig</span><span class="p">)</span>
-                                    <span class="k">if</span> <span class="ow">not</span> <span class="n">attr</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s1">&#39;_&#39;</span><span class="p">)</span>
-                                    <span class="ow">and</span> <span class="nb">callable</span><span class="p">(</span><span class="nb">getattr</span><span class="p">(</span><span class="n">ExecutorConfig</span><span class="p">,</span> <span class="n">attr</span><span class="p">)))</span>
+        <span class="n">executor_config_attrs</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span>
+            <span class="n">attr</span> <span class="k">for</span> <span class="n">attr</span> <span class="ow">in</span> <span class="nb">dir</span><span class="p">(</span><span class="n">_ExecutorConfig</span><span class="p">)</span> <span class="k">if</span> <span class="ow">not</span> <span class="n">attr</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s1">&#39;_&#39;</span><span class="p">)</span>
+            <span class="ow">and</span> <span class="nb">callable</span><span class="p">(</span><span class="nb">getattr</span><span class="p">(</span><span class="n">_ExecutorConfig</span><span class="p">,</span> <span class="n">attr</span><span class="p">)))</span>
         <span class="n">executor_config_attrs</span> <span class="o">-=</span> <span class="n">black_list</span>
-        <span class="n">llm_args_attr</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">LlmArgs</span><span class="o">.</span><span class="n">model_fields</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
-        <span class="c1"># NOTE: When cpp ExecutorConfig add new options, please add the new options into `_LlmArgs` with docs as well</span>
+        <span class="n">llm_args_attr</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">BaseLlmArgs</span><span class="o">.</span><span class="n">model_fields</span><span class="o">.</span><span class="n">keys</span><span class="p">())</span>
+        <span class="c1"># NOTE: When cpp ExecutorConfig add new options, please add the new options into `LlmArgs` with docs as well</span>
         <span class="c1"># ASK chunweiy for help if you are not sure about the new options.</span>
         <span class="k">assert</span> <span class="n">executor_config_attrs</span><span class="o">.</span><span class="n">issubset</span><span class="p">(</span>
             <span class="n">llm_args_attr</span>
         <span class="p">),</span> <span class="sa">f</span><span class="s2">&quot;New options found in underlying ExecutorConfig: </span><span class="si">{</span><span class="n">llm_args_attr</span><span class="w"> </span><span class="o">-</span><span class="w"> </span><span class="n">executor_config_attrs</span><span class="si">}</span><span class="s2">&quot;</span>
 
-        <span class="c1"># ensure build_config and LlmArgs consistency</span>
+        <span class="c1"># ensure build_config and LlmArgsBase consistency</span>
         <span class="k">if</span> <span class="n">kwargs_dict</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;backend&quot;</span><span class="p">)</span> <span class="o">!=</span> <span class="s2">&quot;pytorch&quot;</span> <span class="ow">and</span> <span class="n">kwargs_dict</span><span class="o">.</span><span class="n">get</span><span class="p">(</span>
                 <span class="s2">&quot;build_config&quot;</span><span class="p">):</span>
             <span class="c1"># TODO: move this to _perform_config_arbitration() once it&#39;s default-on.</span>
@@ -1633,11 +1633,11 @@
                 <span class="n">build_val</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">kwargs_dict</span><span class="p">[</span><span class="s2">&quot;build_config&quot;</span><span class="p">],</span> <span class="n">field_name</span><span class="p">,</span>
                                     <span class="kc">None</span><span class="p">)</span>
                 <span class="n">llmargs_val</span> <span class="o">=</span> <span class="n">kwargs_dict</span><span class="o">.</span><span class="n">get</span><span class="p">(</span>
-                    <span class="n">field_name</span><span class="p">)</span> <span class="ow">or</span> <span class="n">LlmArgs</span><span class="o">.</span><span class="n">model_fields</span><span class="p">[</span><span class="n">field_name</span><span class="p">]</span>
+                    <span class="n">field_name</span><span class="p">)</span> <span class="ow">or</span> <span class="n">BaseLlmArgs</span><span class="o">.</span><span class="n">model_fields</span><span class="p">[</span><span class="n">field_name</span><span class="p">]</span>
 
                 <span class="k">if</span> <span class="n">build_val</span> <span class="o">!=</span> <span class="n">llmargs_val</span><span class="p">:</span>
                     <span class="n">logger</span><span class="o">.</span><span class="n">warning</span><span class="p">(</span>
-                        <span class="sa">f</span><span class="s2">&quot;Overriding LlmArgs.</span><span class="si">{</span><span class="n">field_name</span><span class="si">}</span><span class="s2"> (</span><span class="si">{</span><span class="n">llmargs_val</span><span class="si">}</span><span class="s2">) with build_config.</span><span class="si">{</span><span class="n">field_name</span><span class="si">}</span><span class="s2"> (</span><span class="si">{</span><span class="n">build_val</span><span class="si">}</span><span class="s2">).&quot;</span>
+                        <span class="sa">f</span><span class="s2">&quot;Overriding LlmArgsBase.</span><span class="si">{</span><span class="n">field_name</span><span class="si">}</span><span class="s2"> (</span><span class="si">{</span><span class="n">llmargs_val</span><span class="si">}</span><span class="s2">) with build_config.</span><span class="si">{</span><span class="n">field_name</span><span class="si">}</span><span class="s2"> (</span><span class="si">{</span><span class="n">build_val</span><span class="si">}</span><span class="s2">).&quot;</span>
                     <span class="p">)</span>
                     <span class="n">kwargs_dict</span><span class="p">[</span><span class="n">field_name</span><span class="p">]</span> <span class="o">=</span> <span class="n">build_val</span>
 
@@ -1646,12 +1646,15 @@
     <span class="k">def</span><span class="w"> </span><span class="nf">_setup</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
 <span class="w">        </span><span class="sd">&#39;&#39;&#39; This method will setup the configs right before building the model. &#39;&#39;&#39;</span>
 
+        <span class="n">is_trt_llm_args</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">TrtLlmArgs</span><span class="p">)</span>
+
         <span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">model</span><span class="p">,</span>
                           <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Path</span><span class="p">)),</span> <span class="sa">f</span><span class="s2">&quot;Invalid model: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">model</span><span class="si">}</span><span class="s2">&quot;</span>
 
-        <span class="bp">self</span><span class="o">.</span><span class="n">_setup_embedding_parallel_mode</span><span class="p">()</span>
+        <span class="k">if</span> <span class="n">is_trt_llm_args</span><span class="p">:</span>
+            <span class="bp">self</span><span class="o">.</span><span class="n">_setup_embedding_parallel_mode</span><span class="p">()</span>
 
-        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">enable_build_cache</span><span class="p">:</span>
+        <span class="k">if</span> <span class="n">is_trt_llm_args</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">enable_build_cache</span><span class="p">:</span>
             <span class="bp">self</span><span class="o">.</span><span class="n">enable_build_cache</span> <span class="o">=</span> <span class="n">BuildCacheConfig</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span>
                 <span class="bp">self</span><span class="o">.</span><span class="n">enable_build_cache</span><span class="p">,</span> <span class="nb">bool</span><span class="p">)</span> <span class="k">else</span> <span class="bp">self</span><span class="o">.</span><span class="n">enable_build_cache</span>
             <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_build_cache</span><span class="p">,</span> <span class="n">BuildCacheConfig</span><span class="p">):</span>
@@ -1692,7 +1695,8 @@
 
         <span class="bp">self</span><span class="o">.</span><span class="n">quant_config</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">quant_config</span> <span class="ow">or</span> <span class="n">QuantConfig</span><span class="p">()</span>
 
-        <span class="bp">self</span><span class="o">.</span><span class="n">calib_config</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">calib_config</span> <span class="ow">or</span> <span class="n">CalibConfig</span><span class="p">()</span>
+        <span class="k">if</span> <span class="n">is_trt_llm_args</span><span class="p">:</span>
+            <span class="bp">self</span><span class="o">.</span><span class="n">calib_config</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">calib_config</span> <span class="ow">or</span> <span class="n">CalibConfig</span><span class="p">()</span>
 
         <span class="c1"># Note: max_batch_size and max_num_tokens in LlmArgs are for runtime,</span>
         <span class="c1"># which will be passed to the C++ Executor API, overwriting the values</span>
@@ -1719,8 +1723,9 @@
                 <span class="bp">self</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_num_tokens</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_num_tokens</span>
 
         <span class="c1"># TODO: remove the checker when manage weights support all data types</span>
-        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">fast_build</span> <span class="ow">and</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">quant_config</span><span class="o">.</span><span class="n">quant_algo</span> <span class="ow">is</span> <span class="n">QuantAlgo</span><span class="o">.</span><span class="n">FP8</span>
-                                <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">quant_config</span><span class="o">.</span><span class="n">quant_algo</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">):</span>
+        <span class="k">if</span> <span class="n">is_trt_llm_args</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">fast_build</span> <span class="ow">and</span> <span class="p">(</span>
+                <span class="bp">self</span><span class="o">.</span><span class="n">quant_config</span><span class="o">.</span><span class="n">quant_algo</span> <span class="ow">is</span> <span class="n">QuantAlgo</span><span class="o">.</span><span class="n">FP8</span>
+                <span class="ow">or</span> <span class="bp">self</span><span class="o">.</span><span class="n">quant_config</span><span class="o">.</span><span class="n">quant_algo</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">):</span>
             <span class="bp">self</span><span class="o">.</span><span class="n">_update_plugin_config</span><span class="p">(</span><span class="s2">&quot;manage_weights&quot;</span><span class="p">,</span> <span class="kc">True</span><span class="p">)</span>
 
         <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">_world_size</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
@@ -1733,9 +1738,12 @@
             <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_lora_rank</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
                 <span class="bp">self</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">lora_config</span><span class="o">.</span><span class="n">max_lora_rank</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_lora_rank</span>
 
+        <span class="bp">self</span><span class="o">.</span><span class="n">_setup_speculative_config</span><span class="p">()</span>
+
         <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">enable_prompt_adapter</span><span class="p">:</span>
             <span class="bp">self</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_prompt_embedding_table_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">max_prompt_adapter_token</span> <span class="o">*</span> <span class="bp">self</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_batch_size</span>
 
+    <span class="k">def</span><span class="w"> </span><span class="nf">_setup_speculative_config</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
         <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="p">:</span>
             <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="p">,</span> <span class="n">LookaheadDecodingConfig</span><span class="p">):</span>
                 <span class="n">lookahead_config</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span>
@@ -1765,7 +1773,7 @@
                 <span class="bp">self</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_draft_len</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">max_draft_len</span>
 
                 <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">backend</span> <span class="o">!=</span> <span class="s1">&#39;pytorch&#39;</span><span class="p">:</span>
-                    <span class="n">eagle_config</span> <span class="o">=</span> <span class="n">EagleConfig</span><span class="p">(</span>
+                    <span class="n">eagle_config</span> <span class="o">=</span> <span class="n">_EagleConfig</span><span class="p">(</span>
                         <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">eagle_choices</span><span class="p">,</span>
                         <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">greedy_sampling</span><span class="p">,</span>
                         <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">posterior_threshold</span><span class="p">,</span>
@@ -1778,9 +1786,25 @@
                     <span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm._torch.speculative</span><span class="w"> </span><span class="kn">import</span> <span class="n">Eagle3Config</span>
                     <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span> <span class="o">=</span> <span class="n">Eagle3Config</span><span class="p">(</span>
                         <span class="n">max_draft_tokens</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">max_draft_len</span><span class="p">,</span>
-                        <span class="n">eagle_weights_path</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span>
-                        <span class="n">pytorch_eagle_weights_path</span><span class="p">)</span>
-
+                        <span class="n">draft_model_path</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span>
+                        <span class="n">pytorch_eagle_weights_path</span><span class="p">,</span>
+                        <span class="n">eagle3_one_model</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span>
+                        <span class="n">eagle3_one_model</span><span class="p">)</span>
+            <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="p">,</span> <span class="n">NGramDecodingConfig</span><span class="p">):</span>
+                <span class="bp">self</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">speculative_decoding_mode</span> <span class="o">=</span> <span class="n">SpeculativeDecodingMode</span><span class="o">.</span><span class="n">NGRAM</span>
+                <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">backend</span> <span class="o">==</span> <span class="s1">&#39;pytorch&#39;</span>
+                <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">prompt_lookup_num_tokens</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">max_matching_ngram_size</span> <span class="o">&gt;</span> <span class="mi">0</span>
+                <span class="bp">self</span><span class="o">.</span><span class="n">build_config</span><span class="o">.</span><span class="n">max_draft_len</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">max_draft_len</span>
+                <span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm._torch.speculative</span><span class="w"> </span><span class="kn">import</span> <span class="n">NGramConfig</span>
+                <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span> <span class="o">=</span> <span class="n">NGramConfig</span><span class="p">(</span>
+                    <span class="n">prompt_lookup_num_tokens</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span>
+                    <span class="n">prompt_lookup_num_tokens</span><span class="p">,</span>
+                    <span class="n">max_matching_ngram_size</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span>
+                    <span class="n">max_matching_ngram_size</span><span class="p">,</span>
+                    <span class="n">is_keep_all</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">is_keep_all</span><span class="p">,</span>
+                    <span class="n">is_use_oldest</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">is_use_oldest</span><span class="p">,</span>
+                    <span class="n">is_public_pool</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="o">.</span><span class="n">is_public_pool</span><span class="p">,</span>
+                <span class="p">)</span>
             <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span><span class="p">,</span> <span class="n">MTPDecodingConfig</span><span class="p">):</span>
                 <span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm._torch.speculative</span><span class="w"> </span><span class="kn">import</span> <span class="n">MTPConfig</span>
                 <span class="bp">self</span><span class="o">.</span><span class="n">speculative_config</span> <span class="o">=</span> <span class="n">MTPConfig</span><span class="p">(</span>
@@ -1921,32 +1945,409 @@
                 <span class="sa">f</span><span class="s2">&quot;Invalid embedding_parallel_mode: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">llm_args</span><span class="o">.</span><span class="n">embedding_parallel_mode</span><span class="si">}</span><span class="s2">&quot;</span>
             <span class="p">)</span>
 
-    <span class="k">def</span><span class="w"> </span><span class="nf">_validate_kv_cache_config</span><span class="p">(</span><span class="bp">self</span><span class="p">):</span>
-        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
-            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;KvCacheConfig is required for streaming LLM.&quot;</span><span class="p">)</span>
 
-        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span><span class="o">.</span><span class="n">max_attention_window</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
-            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
-                <span class="s2">&quot;KvCacheConfig.max_attention_window should be set for streaming LLM.&quot;</span>
-            <span class="p">)</span>
-        <span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="n">i</span> <span class="o">&lt;=</span> <span class="mi">0</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span><span class="o">.</span><span class="n">max_attention_window</span><span class="p">):</span>
-            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
-                <span class="s2">&quot;Elements in KvCacheConfig.max_attention_window should be greater than 0.&quot;</span>
-            <span class="p">)</span>
+<div class="viewcode-block" id="TrtLlmArgs">
+<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs">[docs]</a>
+<span class="k">class</span><span class="w"> </span><span class="nc">TrtLlmArgs</span><span class="p">(</span><span class="n">BaseLlmArgs</span><span class="p">):</span>
 
-        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span><span class="o">.</span><span class="n">sink_token_length</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
-            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
-                <span class="s2">&quot;KvCacheConfig.sink_token_length should be set for streaming LLM.&quot;</span>
-            <span class="p">)</span>
-        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_config</span><span class="o">.</span><span class="n">sink_token_length</span> <span class="o">&lt;=</span> <span class="mi">0</span><span class="p">:</span>
-            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
-                <span class="s2">&quot;KvCacheConfig.sink_token_length should be greater than 0.&quot;</span><span class="p">)</span>
+    <span class="n">auto_parallel</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable auto parallel mode.&quot;</span><span class="p">,</span>
+        <span class="n">deprecated</span><span class="o">=</span>
+        <span class="s2">&quot;Use tensor_parallel_size/pipeline_parallel_size/xxx_parallel_size instead.&quot;</span><span class="p">,</span>
+    <span class="p">)</span>
 
+    <span class="n">auto_parallel_world_size</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span><span class="s2">&quot;The world size for auto parallel mode.&quot;</span><span class="p">,</span>
+        <span class="n">deprecated</span><span class="o">=</span>
+        <span class="s2">&quot;Use tensor_parallel_size/pipeline_parallel_size/xxx_parallel_size instead.&quot;</span><span class="p">,</span>
+    <span class="p">)</span>
+
+    <span class="n">enable_tqdm</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+                              <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable tqdm for progress bar.&quot;</span><span class="p">)</span>
+
+    <span class="c1"># BuildConfig is introduced to give users a familiar interface to configure the model building.</span>
+    <span class="n">build_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">object</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Build config.&quot;</span><span class="p">,</span>
+        <span class="n">json_schema_extra</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;type&quot;</span><span class="p">:</span> <span class="sa">f</span><span class="s2">&quot;Optional[</span><span class="si">{</span><span class="n">get_type_repr</span><span class="p">(</span><span class="n">BuildConfig</span><span class="p">)</span><span class="si">}</span><span class="s2">]&quot;</span><span class="p">})</span>
+
+    <span class="n">workspace</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
+                                     <span class="n">description</span><span class="o">=</span><span class="s2">&quot;The workspace for the model.&quot;</span><span class="p">)</span>
+
+    <span class="c1"># Once set, the model will reuse the build_cache</span>
+    <span class="n">enable_build_cache</span><span class="p">:</span> <span class="nb">object</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable build cache.&quot;</span><span class="p">,</span>
+        <span class="n">json_schema_extra</span><span class="o">=</span><span class="p">{</span>
+            <span class="s2">&quot;type&quot;</span><span class="p">:</span> <span class="sa">f</span><span class="s2">&quot;Union[</span><span class="si">{</span><span class="n">get_type_repr</span><span class="p">(</span><span class="n">BuildCacheConfig</span><span class="p">)</span><span class="si">}</span><span class="s2">, bool]&quot;</span>
+        <span class="p">})</span>
+
+    <span class="n">extended_runtime_perf_knob_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span>
+        <span class="n">ExtendedRuntimePerfKnobConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+            <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Extended runtime perf knob config.&quot;</span><span class="p">)</span>
+
+    <span class="n">calib_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">CalibConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Calibration config.&quot;</span><span class="p">)</span>
+
+    <span class="n">embedding_parallel_mode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="s1">&#39;SHARDING_ALONG_VOCAB&#39;</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span><span class="s2">&quot;The embedding parallel mode.&quot;</span><span class="p">)</span>
+
+    <span class="n">fast_build</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable fast build.&quot;</span><span class="p">)</span>
+
+    <span class="c1"># Private attributes</span>
+    <span class="n">_auto_parallel_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">AutoParallelConfig</span><span class="p">]</span> <span class="o">=</span> <span class="n">PrivateAttr</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">)</span>
+    <span class="c1"># This is used to hold the options for convert_checkpoint</span>
+    <span class="n">_convert_checkpoint_options</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span>
+                                      <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="n">PrivateAttr</span><span class="p">(</span><span class="n">default_factory</span><span class="o">=</span><span class="nb">dict</span><span class="p">)</span>
+
+    <span class="nd">@property</span>
+    <span class="k">def</span><span class="w"> </span><span class="nf">auto_parallel_config</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">AutoParallelConfig</span><span class="p">:</span>
+        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_auto_parallel_config</span>
+
+<div class="viewcode-block" id="TrtLlmArgs.model_post_init">
+<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.model_post_init">[docs]</a>
+    <span class="nd">@print_traceback_on_error</span>
+    <span class="k">def</span><span class="w"> </span><span class="nf">model_post_init</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">__context</span><span class="p">):</span>
+        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">model_post_init</span><span class="p">(</span><span class="n">__context</span><span class="p">)</span>
+
+        <span class="bp">self</span><span class="o">.</span><span class="n">_auto_parallel_config</span> <span class="o">=</span> <span class="n">AutoParallelConfig</span><span class="p">(</span>
+            <span class="n">sharded_io_allowlist</span><span class="o">=</span><span class="p">[</span>
+                <span class="s2">&quot;past_key_value_</span><span class="se">\\</span><span class="s2">d+&quot;</span><span class="p">,</span>
+                <span class="s2">&quot;present_key_value_</span><span class="se">\\</span><span class="s2">d*&quot;</span><span class="p">,</span>
+            <span class="p">],</span>
+            <span class="n">same_buffer_io</span><span class="o">=</span><span class="p">{</span>
+                <span class="s2">&quot;past_key_value_(</span><span class="se">\\</span><span class="s2">d+)&quot;</span><span class="p">:</span> <span class="s2">&quot;present_key_value_</span><span class="se">\\</span><span class="s2">1&quot;</span><span class="p">,</span>
+            <span class="p">},</span>
+            <span class="o">**</span><span class="n">infer_cluster_config</span><span class="p">(),</span>
+        <span class="p">)</span>
+
+        <span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">auto_parallel</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">auto_parallel</span>
+
+        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">auto_parallel</span><span class="p">:</span>
+            <span class="bp">self</span><span class="o">.</span><span class="n">parallel_config</span><span class="o">.</span><span class="n">world_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">auto_parallel_world_size</span></div>
+</div>
+
+
+
+<span class="n">LlmArgs</span> <span class="o">=</span> <span class="n">TrtLlmArgs</span>
 
 <span class="n">LLMARGS_EXPLICIT_DOCSTRING</span> <span class="o">=</span> <span class="n">generate_api_docs_as_docstring</span><span class="p">(</span><span class="n">LlmArgs</span><span class="p">,</span>
                                                             <span class="n">indent</span><span class="o">=</span><span class="s1">&#39; &#39;</span> <span class="o">*</span> <span class="mi">4</span><span class="p">)</span>
 
 
+<span class="k">class</span><span class="w"> </span><span class="nc">LoadFormat</span><span class="p">(</span><span class="n">Enum</span><span class="p">):</span>
+    <span class="n">AUTO</span> <span class="o">=</span> <span class="mi">0</span>
+    <span class="c1"># Initialize all weights randomly.</span>
+    <span class="n">DUMMY</span> <span class="o">=</span> <span class="mi">1</span>
+
+
+<div class="viewcode-block" id="TorchLlmArgs">
+<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs">[docs]</a>
+<span class="k">class</span><span class="w"> </span><span class="nc">TorchLlmArgs</span><span class="p">(</span><span class="n">BaseLlmArgs</span><span class="p">):</span>
+
+    <span class="c1"># Just a dummy BuildConfig to allow code reuse with the TrtLlmArgs</span>
+    <span class="n">build_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">object</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Build config.&quot;</span><span class="p">,</span>
+        <span class="n">exclude_from_json</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
+        <span class="n">json_schema_extra</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;type&quot;</span><span class="p">:</span> <span class="sa">f</span><span class="s2">&quot;Optional[</span><span class="si">{</span><span class="n">get_type_repr</span><span class="p">(</span><span class="n">BuildConfig</span><span class="p">)</span><span class="si">}</span><span class="s2">]&quot;</span><span class="p">})</span>
+
+    <span class="c1"># PyTorch backend specific configurations</span>
+
+    <span class="n">use_cuda_graph</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span>
+        <span class="s2">&quot;If true, use CUDA graphs for decoding. CUDA graphs are only created for the batch sizes in cuda_graph_batch_sizes, and are enabled for batches that consist of decoding requests *only* (the reason is that it&#39;s hard to capture a single graph with prefill requests since the input shapes are a function of the sequence lengths). Note that each CUDA graph can use up to 200 MB of extra memory.&quot;</span>
+    <span class="p">)</span>
+
+    <span class="n">cuda_graph_batch_sizes</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span><span class="s2">&quot;List of batch sizes to create CUDA graphs for.&quot;</span><span class="p">)</span>
+
+    <span class="n">cuda_graph_max_batch_size</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="mi">0</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Maximum batch size for CUDA graphs.&quot;</span><span class="p">)</span>
+
+    <span class="n">cuda_graph_padding_enabled</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span>
+        <span class="s2">&quot;If true, batches are rounded up to the nearest cuda_graph_batch_size. This is usually a net win for performance.&quot;</span>
+    <span class="p">)</span>
+
+    <span class="n">disable_overlap_scheduler</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Disable the overlap scheduler.&quot;</span><span class="p">)</span>
+
+    <span class="n">moe_max_num_tokens</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span>
+        <span class="s2">&quot;If set, at most moe_max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. If the number of tokens exceeds moe_max_num_tokens, the input tensors will be split into chunks and a for loop will be used.&quot;</span>
+    <span class="p">)</span>
+
+    <span class="n">moe_load_balancer</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">object</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Configuration for MoE load balancing.&quot;</span><span class="p">,</span>
+        <span class="n">json_schema_extra</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;type&quot;</span><span class="p">:</span> <span class="s2">&quot;Union[MoeLoadBalancerConfig, str]&quot;</span><span class="p">})</span>
+
+    <span class="n">attn_backend</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="s1">&#39;TRTLLM&#39;</span><span class="p">,</span>
+                              <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Attention backend to use.&quot;</span><span class="p">)</span>
+
+    <span class="n">moe_backend</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="s1">&#39;CUTLASS&#39;</span><span class="p">,</span>
+                             <span class="n">description</span><span class="o">=</span><span class="s2">&quot;MoE backend to use.&quot;</span><span class="p">)</span>
+
+    <span class="n">mixed_sampler</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span>
+        <span class="s2">&quot;If true, will iterate over sampling_params of each request and use the corresponding sampling strategy, e.g. top-k, top-p, etc.&quot;</span>
+    <span class="p">)</span>
+
+    <span class="n">enable_trtllm_sampler</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span>
+        <span class="s2">&quot;If true, will use the TRTLLM sampler instead of the PyTorch sampler. The TRTLLM sampler has a wide coverage of sampling strategies.&quot;</span>
+    <span class="p">)</span>
+
+    <span class="n">kv_cache_dtype</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="s2">&quot;auto&quot;</span><span class="p">,</span>
+                                <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Data type for KV cache.&quot;</span><span class="p">)</span>
+
+    <span class="n">use_kv_cache</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
+                               <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Whether to use KV cache.&quot;</span><span class="p">)</span>
+
+    <span class="n">enable_iter_perf_stats</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable iteration performance statistics.&quot;</span><span class="p">)</span>
+
+    <span class="n">enable_iter_req_stats</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span>
+        <span class="s2">&quot;If true, enables per request stats per iteration. Must also set enable_iter_perf_stats to true to get request stats.&quot;</span>
+    <span class="p">)</span>
+
+    <span class="n">print_iter_log</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span><span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+                                 <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Print iteration logs.&quot;</span><span class="p">)</span>
+
+    <span class="n">torch_compile_enabled</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable torch.compile optimization.&quot;</span><span class="p">)</span>
+
+    <span class="n">torch_compile_fullgraph</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable full graph compilation in torch.compile.&quot;</span><span class="p">)</span>
+
+    <span class="n">torch_compile_inductor_enabled</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable inductor backend in torch.compile.&quot;</span><span class="p">)</span>
+
+    <span class="n">torch_compile_piecewise_cuda_graph</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable piecewise CUDA graph in torch.compile.&quot;</span><span class="p">)</span>
+
+    <span class="n">torch_compile_enable_userbuffers</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span>
+        <span class="s2">&quot;When torch compile is enabled, userbuffers is enabled by default.&quot;</span><span class="p">)</span>
+
+    <span class="n">autotuner_enabled</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Enable autotuner only when torch compile is enabled.&quot;</span><span class="p">)</span>
+
+    <span class="n">enable_layerwise_nvtx_marker</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> <span class="n">description</span><span class="o">=</span><span class="s2">&quot;If true, enable layerwise nvtx marker.&quot;</span><span class="p">)</span>
+
+    <span class="n">auto_deploy_config</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">object</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span><span class="s2">&quot;Auto deploy config.&quot;</span><span class="p">,</span>
+        <span class="n">exclude_from_json</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
+        <span class="n">json_schema_extra</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;type&quot;</span><span class="p">:</span> <span class="sa">f</span><span class="s2">&quot;Optional[AutoDeployConfig]&quot;</span><span class="p">})</span>
+
+    <span class="n">load_format</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">LoadFormat</span><span class="p">]</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="n">LoadFormat</span><span class="o">.</span><span class="n">AUTO</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span>
+        <span class="s2">&quot;How to load the model weights. By default, detect the weight type from the model checkpoint.&quot;</span>
+    <span class="p">)</span>
+
+    <span class="n">enable_min_latency</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="n">Field</span><span class="p">(</span>
+        <span class="n">default</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+        <span class="n">description</span><span class="o">=</span>
+        <span class="s2">&quot;If true, enable min-latency mode. Currently only used for Llama4.&quot;</span><span class="p">,</span>
+    <span class="p">)</span>
+
+<div class="viewcode-block" id="TorchLlmArgs.convert_load_format">
+<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.convert_load_format">[docs]</a>
+    <span class="nd">@field_validator</span><span class="p">(</span><span class="s1">&#39;load_format&#39;</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s1">&#39;before&#39;</span><span class="p">)</span>
+    <span class="nd">@classmethod</span>
+    <span class="k">def</span><span class="w"> </span><span class="nf">convert_load_format</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">v</span><span class="p">):</span>
+        <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="n">LoadFormat</span><span class="p">):</span>
+            <span class="k">return</span> <span class="n">v</span>
+        <span class="n">load_format</span> <span class="o">=</span> <span class="n">v</span><span class="o">.</span><span class="n">upper</span><span class="p">()</span>
+        <span class="k">if</span> <span class="n">load_format</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">LoadFormat</span><span class="o">.</span><span class="n">__members__</span><span class="p">:</span>
+            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Invalid LoadFormat: </span><span class="si">{</span><span class="n">v</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
+        <span class="k">return</span> <span class="n">LoadFormat</span><span class="p">[</span><span class="n">load_format</span><span class="p">]</span></div>
+
+
+    <span class="c1"># Extra resource managers to use in addition to the KV cache manager.</span>
+    <span class="c1"># Each manager&#39;s prepare_resources method is called before the forward pass,</span>
+    <span class="c1"># and update_resources() is called after the pass finishes. free_resources()</span>
+    <span class="c1"># is called when a request finishes. The KV cache manager is guaranteed to</span>
+    <span class="c1"># be invoked after all of these extra managers in all stages.</span>
+    <span class="n">_extra_resource_managers</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span>
+                                   <span class="nb">object</span><span class="p">]</span> <span class="o">=</span> <span class="n">PrivateAttr</span><span class="p">(</span><span class="n">default_factory</span><span class="o">=</span><span class="nb">dict</span><span class="p">,</span> <span class="p">)</span>
+
+    <span class="nd">@property</span>
+    <span class="k">def</span><span class="w"> </span><span class="nf">extra_resource_managers</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">object</span><span class="p">]:</span>
+        <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_extra_resource_managers</span>
+
+    <span class="nd">@extra_resource_managers</span><span class="o">.</span><span class="n">setter</span>
+    <span class="k">def</span><span class="w"> </span><span class="nf">extra_resource_managers</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">object</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">_extra_resource_managers</span> <span class="o">=</span> <span class="n">value</span>
+
+<div class="viewcode-block" id="TorchLlmArgs.model_post_init">
+<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.model_post_init">[docs]</a>
+    <span class="nd">@print_traceback_on_error</span>
+    <span class="k">def</span><span class="w"> </span><span class="nf">model_post_init</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">__context</span><span class="p">):</span>
+        <span class="kn">from</span><span class="w"> </span><span class="nn">.._torch.model_config</span><span class="w"> </span><span class="kn">import</span> <span class="n">MoeLoadBalancerConfig</span>
+
+        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">model_post_init</span><span class="p">(</span><span class="n">__context</span><span class="p">)</span>
+        <span class="bp">self</span><span class="o">.</span><span class="n">model_format</span> <span class="o">=</span> <span class="n">_ModelFormatKind</span><span class="o">.</span><span class="n">HF</span>
+
+        <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_load_balancer</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
+            <span class="k">if</span> <span class="ow">not</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">exists</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_load_balancer</span><span class="p">):</span>
+                <span class="k">raise</span> <span class="ne">FileNotFoundError</span><span class="p">(</span>
+                    <span class="sa">f</span><span class="s2">&quot;MoE load balancer config file not found: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_load_balancer</span><span class="si">}</span><span class="s2">&quot;</span>
+                <span class="p">)</span>
+            <span class="k">try</span><span class="p">:</span>
+                <span class="k">with</span> <span class="nb">open</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_load_balancer</span><span class="p">)</span> <span class="k">as</span> <span class="n">f</span><span class="p">:</span>
+                    <span class="n">moe_load_balancer_config</span> <span class="o">=</span> <span class="n">yaml</span><span class="o">.</span><span class="n">safe_load</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
+                <span class="bp">self</span><span class="o">.</span><span class="n">moe_load_balancer</span> <span class="o">=</span> <span class="n">MoeLoadBalancerConfig</span><span class="p">(</span>
+                    <span class="o">**</span><span class="n">moe_load_balancer_config</span><span class="p">)</span>
+            <span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
+                <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
+                    <span class="sa">f</span><span class="s2">&quot;Failed to load MoE load balancer config file: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_load_balancer</span><span class="si">}</span><span class="s2">&quot;</span>
+                <span class="p">)</span> <span class="kn">from</span><span class="w"> </span><span class="nn">e</span></div>
+
+
+    <span class="c1"># TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig</span>
+<div class="viewcode-block" id="TorchLlmArgs.get_pytorch_backend_config">
+<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.get_pytorch_backend_config">[docs]</a>
+    <span class="k">def</span><span class="w"> </span><span class="nf">get_pytorch_backend_config</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PyTorchConfig&quot;</span><span class="p">:</span>
+        <span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm._torch.pyexecutor.config</span><span class="w"> </span><span class="kn">import</span> <span class="n">PyTorchConfig</span>
+
+        <span class="c1"># TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig</span>
+        <span class="c1"># Just a WAR to support the auto_deploy</span>
+        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">auto_deploy_config</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
+            <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">auto_deploy_config</span>
+
+        <span class="k">return</span> <span class="n">PyTorchConfig</span><span class="p">(</span>
+            <span class="n">extra_resource_managers</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">extra_resource_managers</span><span class="p">,</span>
+            <span class="n">use_cuda_graph</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">use_cuda_graph</span><span class="p">,</span>
+            <span class="n">cuda_graph_batch_sizes</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_batch_sizes</span><span class="p">,</span>
+            <span class="n">cuda_graph_max_batch_size</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_max_batch_size</span><span class="p">,</span>
+            <span class="n">cuda_graph_padding_enabled</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_padding_enabled</span><span class="p">,</span>
+            <span class="n">disable_overlap_scheduler</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">disable_overlap_scheduler</span><span class="p">,</span>
+            <span class="n">moe_max_num_tokens</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_max_num_tokens</span><span class="p">,</span>
+            <span class="n">moe_load_balancer</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_load_balancer</span><span class="p">,</span>
+            <span class="n">attn_backend</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">attn_backend</span><span class="p">,</span>
+            <span class="n">moe_backend</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">moe_backend</span><span class="p">,</span>
+            <span class="n">mixed_sampler</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">mixed_sampler</span><span class="p">,</span>
+            <span class="n">enable_trtllm_sampler</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_trtllm_sampler</span><span class="p">,</span>
+            <span class="n">kv_cache_dtype</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">kv_cache_dtype</span><span class="p">,</span>
+            <span class="n">use_kv_cache</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">use_kv_cache</span><span class="p">,</span>
+            <span class="n">enable_iter_perf_stats</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_iter_perf_stats</span><span class="p">,</span>
+            <span class="n">enable_iter_req_stats</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_iter_req_stats</span><span class="p">,</span>
+            <span class="n">print_iter_log</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">print_iter_log</span><span class="p">,</span>
+            <span class="n">torch_compile_enabled</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">torch_compile_enabled</span><span class="p">,</span>
+            <span class="n">torch_compile_fullgraph</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">torch_compile_fullgraph</span><span class="p">,</span>
+            <span class="n">torch_compile_inductor_enabled</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">torch_compile_inductor_enabled</span><span class="p">,</span>
+            <span class="n">torch_compile_piecewise_cuda_graph</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span>
+            <span class="n">torch_compile_piecewise_cuda_graph</span><span class="p">,</span>
+            <span class="n">torch_compile_enable_userbuffers</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span>
+            <span class="n">torch_compile_enable_userbuffers</span><span class="p">,</span>
+            <span class="n">autotuner_enabled</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">autotuner_enabled</span><span class="p">,</span>
+            <span class="n">enable_layerwise_nvtx_marker</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_layerwise_nvtx_marker</span><span class="p">,</span>
+            <span class="n">load_format</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">load_format</span><span class="p">,</span>
+            <span class="n">enable_min_latency</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">enable_min_latency</span><span class="p">)</span></div>
+
+
+<div class="viewcode-block" id="TorchLlmArgs.validate_cuda_graph_max_batch_size">
+<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.validate_cuda_graph_max_batch_size">[docs]</a>
+    <span class="nd">@field_validator</span><span class="p">(</span><span class="s1">&#39;cuda_graph_max_batch_size&#39;</span><span class="p">)</span>
+    <span class="nd">@classmethod</span>
+    <span class="k">def</span><span class="w"> </span><span class="nf">validate_cuda_graph_max_batch_size</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="n">v</span><span class="p">):</span>
+<span class="w">        </span><span class="sd">&quot;&quot;&quot;Validate cuda_graph_max_batch_size is non-negative.&quot;&quot;&quot;</span>
+        <span class="k">if</span> <span class="n">v</span> <span class="o">&lt;</span> <span class="mi">0</span><span class="p">:</span>
+            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;cuda_graph_max_batch_size must be non-negative&quot;</span><span class="p">)</span>
+        <span class="k">return</span> <span class="n">v</span></div>
+
+
+    <span class="nd">@staticmethod</span>
+    <span class="k">def</span><span class="w"> </span><span class="nf">_generate_cuda_graph_batch_sizes</span><span class="p">(</span><span class="n">max_batch_size</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
+                                         <span class="n">padding_enabled</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]:</span>
+<span class="w">        </span><span class="sd">&quot;&quot;&quot;Generate a list of batch sizes for CUDA graphs.</span>
+
+<span class="sd">        Args:</span>
+<span class="sd">            max_batch_size: Maximum batch size to generate up to</span>
+<span class="sd">            padding_enabled: Whether padding is enabled, which affects the batch size distribution</span>
+
+<span class="sd">        Returns:</span>
+<span class="sd">            List of batch sizes to create CUDA graphs for</span>
+<span class="sd">        &quot;&quot;&quot;</span>
+        <span class="k">if</span> <span class="n">padding_enabled</span><span class="p">:</span>
+            <span class="n">batch_sizes</span> <span class="o">=</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">4</span><span class="p">]</span> <span class="o">+</span> <span class="p">[</span><span class="n">i</span> <span class="o">*</span> <span class="mi">8</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">17</span><span class="p">)]</span>
+        <span class="k">else</span><span class="p">:</span>
+            <span class="n">batch_sizes</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">32</span><span class="p">))</span> <span class="o">+</span> <span class="p">[</span><span class="mi">32</span><span class="p">,</span> <span class="mi">64</span><span class="p">,</span> <span class="mi">128</span><span class="p">]</span>
+
+        <span class="c1"># Add powers of 2 up to max_batch_size</span>
+        <span class="n">batch_sizes</span> <span class="o">+=</span> <span class="p">[</span>
+            <span class="mi">2</span><span class="o">**</span><span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">8</span><span class="p">,</span> <span class="n">math</span><span class="o">.</span><span class="n">floor</span><span class="p">(</span><span class="n">math</span><span class="o">.</span><span class="n">log</span><span class="p">(</span><span class="n">max_batch_size</span><span class="p">,</span> <span class="mi">2</span><span class="p">)))</span>
+        <span class="p">]</span>
+
+        <span class="c1"># Filter and sort batch sizes</span>
+        <span class="n">batch_sizes</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span>
+            <span class="p">[</span><span class="n">size</span> <span class="k">for</span> <span class="n">size</span> <span class="ow">in</span> <span class="n">batch_sizes</span> <span class="k">if</span> <span class="n">size</span> <span class="o">&lt;=</span> <span class="n">max_batch_size</span><span class="p">])</span>
+
+        <span class="c1"># Add max_batch_size if not already included</span>
+        <span class="k">if</span> <span class="n">max_batch_size</span> <span class="o">!=</span> <span class="n">batch_sizes</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]:</span>
+            <span class="n">batch_sizes</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">max_batch_size</span><span class="p">)</span>
+
+        <span class="k">return</span> <span class="n">batch_sizes</span>
+
+<div class="viewcode-block" id="TorchLlmArgs.validate_cuda_graph_config">
+<a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.validate_cuda_graph_config">[docs]</a>
+    <span class="nd">@model_validator</span><span class="p">(</span><span class="n">mode</span><span class="o">=</span><span class="s1">&#39;after&#39;</span><span class="p">)</span>
+    <span class="k">def</span><span class="w"> </span><span class="nf">validate_cuda_graph_config</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s1">&#39;TorchLlmArgs&#39;</span><span class="p">:</span>
+<span class="w">        </span><span class="sd">&quot;&quot;&quot;Validate CUDA graph configuration.</span>
+
+<span class="sd">        Ensures that:</span>
+<span class="sd">        1. If cuda_graph_batch_sizes is provided, cuda_graph_max_batch_size must be 0</span>
+<span class="sd">        2. If cuda_graph_batch_sizes is not provided, it is generated based on cuda_graph_max_batch_size</span>
+<span class="sd">        3. If both are provided, cuda_graph_batch_sizes must match the generated values</span>
+<span class="sd">        &quot;&quot;&quot;</span>
+        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_batch_sizes</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
+            <span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_batch_sizes</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_batch_sizes</span><span class="p">)</span>
+            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_max_batch_size</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span>
+                <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_batch_sizes</span> <span class="o">!=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_generate_cuda_graph_batch_sizes</span><span class="p">(</span>
+                        <span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_max_batch_size</span><span class="p">,</span>
+                        <span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_padding_enabled</span><span class="p">):</span>
+                    <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
+                        <span class="s2">&quot;Please don&#39;t set both cuda_graph_batch_sizes &quot;</span>
+                        <span class="s2">&quot;and cuda_graph_max_batch_size.</span><span class="se">\n</span><span class="s2">&quot;</span>
+                        <span class="sa">f</span><span class="s2">&quot;cuda_graph_batch_sizes: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_batch_sizes</span><span class="si">}</span><span class="s2">, &quot;</span>
+                        <span class="sa">f</span><span class="s2">&quot;cuda_graph_max_batch_size: </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_max_batch_size</span><span class="si">}</span><span class="s2">&quot;</span>
+                    <span class="p">)</span>
+            <span class="k">else</span><span class="p">:</span>
+                <span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_max_batch_size</span> <span class="o">=</span> <span class="nb">max</span><span class="p">(</span>
+                    <span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_batch_sizes</span><span class="p">)</span>
+        <span class="k">else</span><span class="p">:</span>
+            <span class="n">max_batch_size</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_max_batch_size</span> <span class="ow">or</span> <span class="mi">128</span>
+            <span class="n">generated_sizes</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_generate_cuda_graph_batch_sizes</span><span class="p">(</span>
+                <span class="n">max_batch_size</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_padding_enabled</span><span class="p">)</span>
+            <span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_batch_sizes</span> <span class="o">=</span> <span class="n">generated_sizes</span>
+            <span class="bp">self</span><span class="o">.</span><span class="n">cuda_graph_max_batch_size</span> <span class="o">=</span> <span class="n">max_batch_size</span>
+
+        <span class="k">return</span> <span class="bp">self</span></div>
+</div>
+
+
+
 <span class="k">def</span><span class="w"> </span><span class="nf">update_llm_args_with_extra_dict</span><span class="p">(</span>
         <span class="n">llm_args</span><span class="p">:</span> <span class="n">Dict</span><span class="p">,</span>
         <span class="n">llm_args_dict</span><span class="p">:</span> <span class="n">Dict</span><span class="p">,</span>
@@ -2126,6 +2527,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/llmapi/mpi_session.html b/latest/_modules/tensorrt_llm/llmapi/mpi_session.html
index 158997e540..22c585bdba 100644
--- a/latest/_modules/tensorrt_llm/llmapi/mpi_session.html
+++ b/latest/_modules/tensorrt_llm/llmapi/mpi_session.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -688,9 +692,6 @@
 <div class="viewcode-block" id="MpiCommSession.__init__">
 <a class="viewcode-back" href="../../../llm-api/reference.html#tensorrt_llm.llmapi.MpiCommSession.__init__">[docs]</a>
     <span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">comm</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">n_workers</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">):</span>
-        <span class="k">if</span> <span class="ow">not</span> <span class="n">external_mpi_comm_available</span><span class="p">(</span><span class="n">n_workers</span><span class="p">):</span>
-            <span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span><span class="s1">&#39;The LLM instance should be launched by mpirun.&#39;</span><span class="p">)</span>
-
         <span class="bp">self</span><span class="o">.</span><span class="n">comm</span> <span class="o">=</span> <span class="n">comm</span>
         <span class="bp">self</span><span class="o">.</span><span class="n">n_workers</span> <span class="o">=</span> <span class="n">n_workers</span>
         <span class="bp">self</span><span class="o">.</span><span class="n">thread_pool</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">ThreadPoolExecutor</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
@@ -1147,6 +1148,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/baichuan/model.html b/latest/_modules/tensorrt_llm/models/baichuan/model.html
index 121f134443..d90b92f478 100644
--- a/latest/_modules/tensorrt_llm/models/baichuan/model.html
+++ b/latest/_modules/tensorrt_llm/models/baichuan/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -873,6 +877,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/bert/model.html b/latest/_modules/tensorrt_llm/models/bert/model.html
index 8e7fe4390a..c02ec73408 100644
--- a/latest/_modules/tensorrt_llm/models/bert/model.html
+++ b/latest/_modules/tensorrt_llm/models/bert/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1177,6 +1181,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/bloom/model.html b/latest/_modules/tensorrt_llm/models/bloom/model.html
index dcc93556a6..3f1e04df38 100644
--- a/latest/_modules/tensorrt_llm/models/bloom/model.html
+++ b/latest/_modules/tensorrt_llm/models/bloom/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -785,6 +789,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/chatglm/config.html b/latest/_modules/tensorrt_llm/models/chatglm/config.html
index 4e84f1d005..82d441fa93 100644
--- a/latest/_modules/tensorrt_llm/models/chatglm/config.html
+++ b/latest/_modules/tensorrt_llm/models/chatglm/config.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -802,6 +806,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/chatglm/model.html b/latest/_modules/tensorrt_llm/models/chatglm/model.html
index bb7c9e4800..079b8e1f18 100644
--- a/latest/_modules/tensorrt_llm/models/chatglm/model.html
+++ b/latest/_modules/tensorrt_llm/models/chatglm/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1001,6 +1005,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/clip/model.html b/latest/_modules/tensorrt_llm/models/clip/model.html
index da2bb70e0c..1a6b4656ce 100644
--- a/latest/_modules/tensorrt_llm/models/clip/model.html
+++ b/latest/_modules/tensorrt_llm/models/clip/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -830,6 +834,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/cogvlm/config.html b/latest/_modules/tensorrt_llm/models/cogvlm/config.html
index d2935fa0bf..725efe1916 100644
--- a/latest/_modules/tensorrt_llm/models/cogvlm/config.html
+++ b/latest/_modules/tensorrt_llm/models/cogvlm/config.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -661,6 +665,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/cogvlm/model.html b/latest/_modules/tensorrt_llm/models/cogvlm/model.html
index 5cf1d41816..5a72b28a35 100644
--- a/latest/_modules/tensorrt_llm/models/cogvlm/model.html
+++ b/latest/_modules/tensorrt_llm/models/cogvlm/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -914,6 +918,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/commandr/model.html b/latest/_modules/tensorrt_llm/models/commandr/model.html
index 2fa81a7940..344cbc789e 100644
--- a/latest/_modules/tensorrt_llm/models/commandr/model.html
+++ b/latest/_modules/tensorrt_llm/models/commandr/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -812,6 +816,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/dbrx/config.html b/latest/_modules/tensorrt_llm/models/dbrx/config.html
index 7927331bb3..914d526e83 100644
--- a/latest/_modules/tensorrt_llm/models/dbrx/config.html
+++ b/latest/_modules/tensorrt_llm/models/dbrx/config.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -676,6 +680,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/dbrx/model.html b/latest/_modules/tensorrt_llm/models/dbrx/model.html
index cba710ae0d..abecf36478 100644
--- a/latest/_modules/tensorrt_llm/models/dbrx/model.html
+++ b/latest/_modules/tensorrt_llm/models/dbrx/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -802,6 +806,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/deepseek_v1/model.html b/latest/_modules/tensorrt_llm/models/deepseek_v1/model.html
index fc6495ebef..e1e4f5805c 100644
--- a/latest/_modules/tensorrt_llm/models/deepseek_v1/model.html
+++ b/latest/_modules/tensorrt_llm/models/deepseek_v1/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -896,6 +900,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/deepseek_v2/model.html b/latest/_modules/tensorrt_llm/models/deepseek_v2/model.html
index 6bb2bd45fa..237dedb2d0 100644
--- a/latest/_modules/tensorrt_llm/models/deepseek_v2/model.html
+++ b/latest/_modules/tensorrt_llm/models/deepseek_v2/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -978,6 +982,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/dit/model.html b/latest/_modules/tensorrt_llm/models/dit/model.html
index 4fe93289e6..2e62c27bf2 100644
--- a/latest/_modules/tensorrt_llm/models/dit/model.html
+++ b/latest/_modules/tensorrt_llm/models/dit/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1014,6 +1018,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/eagle/model.html b/latest/_modules/tensorrt_llm/models/eagle/model.html
index 4b0c34cf57..8059e80fca 100644
--- a/latest/_modules/tensorrt_llm/models/eagle/model.html
+++ b/latest/_modules/tensorrt_llm/models/eagle/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1950,6 +1954,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/enc_dec/model.html b/latest/_modules/tensorrt_llm/models/enc_dec/model.html
index a9ded9df0d..fac8df8da0 100644
--- a/latest/_modules/tensorrt_llm/models/enc_dec/model.html
+++ b/latest/_modules/tensorrt_llm/models/enc_dec/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -2855,6 +2859,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/falcon/config.html b/latest/_modules/tensorrt_llm/models/falcon/config.html
index 6e615485c3..11d7eeb7ef 100644
--- a/latest/_modules/tensorrt_llm/models/falcon/config.html
+++ b/latest/_modules/tensorrt_llm/models/falcon/config.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -737,6 +741,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/falcon/model.html b/latest/_modules/tensorrt_llm/models/falcon/model.html
index 812fc1ee3e..85b1be8036 100644
--- a/latest/_modules/tensorrt_llm/models/falcon/model.html
+++ b/latest/_modules/tensorrt_llm/models/falcon/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -899,6 +903,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/gemma/config.html b/latest/_modules/tensorrt_llm/models/gemma/config.html
index e92d1f837c..961dd54eb6 100644
--- a/latest/_modules/tensorrt_llm/models/gemma/config.html
+++ b/latest/_modules/tensorrt_llm/models/gemma/config.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -827,6 +831,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/gemma/model.html b/latest/_modules/tensorrt_llm/models/gemma/model.html
index 09de913de8..e80fa36866 100644
--- a/latest/_modules/tensorrt_llm/models/gemma/model.html
+++ b/latest/_modules/tensorrt_llm/models/gemma/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1019,6 +1023,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/gpt/config.html b/latest/_modules/tensorrt_llm/models/gpt/config.html
index c4d6aaa95f..20156f06cd 100644
--- a/latest/_modules/tensorrt_llm/models/gpt/config.html
+++ b/latest/_modules/tensorrt_llm/models/gpt/config.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -946,6 +950,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/gpt/model.html b/latest/_modules/tensorrt_llm/models/gpt/model.html
index db39c0706f..db4d419722 100644
--- a/latest/_modules/tensorrt_llm/models/gpt/model.html
+++ b/latest/_modules/tensorrt_llm/models/gpt/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1055,6 +1059,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/gptj/config.html b/latest/_modules/tensorrt_llm/models/gptj/config.html
index 0e030d97e4..fa05554092 100644
--- a/latest/_modules/tensorrt_llm/models/gptj/config.html
+++ b/latest/_modules/tensorrt_llm/models/gptj/config.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -675,6 +679,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/gptj/model.html b/latest/_modules/tensorrt_llm/models/gptj/model.html
index fc4a558791..7082784cc5 100644
--- a/latest/_modules/tensorrt_llm/models/gptj/model.html
+++ b/latest/_modules/tensorrt_llm/models/gptj/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -827,6 +831,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/gptneox/model.html b/latest/_modules/tensorrt_llm/models/gptneox/model.html
index 1f0eac083d..03c4ff698e 100644
--- a/latest/_modules/tensorrt_llm/models/gptneox/model.html
+++ b/latest/_modules/tensorrt_llm/models/gptneox/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -767,6 +771,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/llama/config.html b/latest/_modules/tensorrt_llm/models/llama/config.html
index a1ffd20cef..8ec04ebf96 100644
--- a/latest/_modules/tensorrt_llm/models/llama/config.html
+++ b/latest/_modules/tensorrt_llm/models/llama/config.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -901,6 +905,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/llama/model.html b/latest/_modules/tensorrt_llm/models/llama/model.html
index f4cb58b7a7..e0986b01f8 100644
--- a/latest/_modules/tensorrt_llm/models/llama/model.html
+++ b/latest/_modules/tensorrt_llm/models/llama/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1249,6 +1253,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/mamba/model.html b/latest/_modules/tensorrt_llm/models/mamba/model.html
index 9cbde2eb0e..0440f62009 100644
--- a/latest/_modules/tensorrt_llm/models/mamba/model.html
+++ b/latest/_modules/tensorrt_llm/models/mamba/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1094,6 +1098,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/medusa/config.html b/latest/_modules/tensorrt_llm/models/medusa/config.html
index 32ef36311e..8c12f3cecb 100644
--- a/latest/_modules/tensorrt_llm/models/medusa/config.html
+++ b/latest/_modules/tensorrt_llm/models/medusa/config.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -734,6 +738,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/medusa/model.html b/latest/_modules/tensorrt_llm/models/medusa/model.html
index d488c8a280..bc80024bbc 100644
--- a/latest/_modules/tensorrt_llm/models/medusa/model.html
+++ b/latest/_modules/tensorrt_llm/models/medusa/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -884,6 +888,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/mllama/model.html b/latest/_modules/tensorrt_llm/models/mllama/model.html
index ec693bdaf4..5cbf6a0e7f 100644
--- a/latest/_modules/tensorrt_llm/models/mllama/model.html
+++ b/latest/_modules/tensorrt_llm/models/mllama/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -2195,6 +2199,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/mmdit_sd3/model.html b/latest/_modules/tensorrt_llm/models/mmdit_sd3/model.html
index f9826929aa..ce3902bdff 100644
--- a/latest/_modules/tensorrt_llm/models/mmdit_sd3/model.html
+++ b/latest/_modules/tensorrt_llm/models/mmdit_sd3/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1261,6 +1265,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/modeling_utils.html b/latest/_modules/tensorrt_llm/models/modeling_utils.html
index edab0f27f1..3ca5f1f7ea 100644
--- a/latest/_modules/tensorrt_llm/models/modeling_utils.html
+++ b/latest/_modules/tensorrt_llm/models/modeling_utils.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -551,7 +555,7 @@
                                    <span class="n">WeightOnlyQuantRowLinear</span><span class="p">)</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">..quantization.mode</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">KV_CACHE_QUANT_ALGO_LIST</span><span class="p">,</span> <span class="n">QUANT_ALGO_LIST</span><span class="p">,</span>
                                  <span class="n">W8A8_SQ_PLUGIN_LIST</span><span class="p">,</span> <span class="n">QuantAlgo</span><span class="p">)</span>
-<span class="kn">from</span><span class="w"> </span><span class="nn">..quantization.utils.fp4_utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">float4_sf_dtype</span>
+<span class="kn">from</span><span class="w"> </span><span class="nn">..quantization.utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">fp4_utils</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">..top_model_mixin</span><span class="w"> </span><span class="kn">import</span> <span class="n">TopModelMixin</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">.convert_utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">weight_only_quantize_dict</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">.generation_mixin</span><span class="w"> </span><span class="kn">import</span> <span class="n">GenerationMixin</span>
@@ -603,6 +607,7 @@
     <span class="n">LOOKAHEAD_DECODING</span> <span class="o">=</span> <span class="n">auto</span><span class="p">()</span>
     <span class="n">EXPLICIT_DRAFT_TOKENS</span> <span class="o">=</span> <span class="n">auto</span><span class="p">()</span>
     <span class="n">EAGLE</span> <span class="o">=</span> <span class="n">auto</span><span class="p">()</span>
+    <span class="n">NGRAM</span> <span class="o">=</span> <span class="n">auto</span><span class="p">()</span>
 
 <div class="viewcode-block" id="SpeculativeDecodingMode.from_arguments">
 <a class="viewcode-back" href="../../../python-api/tensorrt_llm.models.html#tensorrt_llm.llmapi.SpeculativeDecodingMode.from_arguments">[docs]</a>
@@ -620,6 +625,8 @@
             <span class="k">return</span> <span class="n">SpeculativeDecodingMode</span><span class="o">.</span><span class="n">EXPLICIT_DRAFT_TOKENS</span>
         <span class="k">elif</span> <span class="n">args</span><span class="o">.</span><span class="n">speculative_decoding_mode</span> <span class="o">==</span> <span class="s2">&quot;eagle&quot;</span><span class="p">:</span>
             <span class="k">return</span> <span class="n">SpeculativeDecodingMode</span><span class="o">.</span><span class="n">EAGLE</span>
+        <span class="k">elif</span> <span class="n">args</span><span class="o">.</span><span class="n">speculative_decoding_mode</span> <span class="o">==</span> <span class="s2">&quot;ngram&quot;</span><span class="p">:</span>
+            <span class="k">return</span> <span class="n">SpeculativeDecodingMode</span><span class="o">.</span><span class="n">NGRAM</span>
         <span class="k">else</span><span class="p">:</span>
             <span class="k">assert</span> <span class="kc">False</span><span class="p">,</span> <span class="s2">&quot;Unknown speculative_decoding_mode &quot;</span> <span class="o">+</span> <span class="n">args</span><span class="o">.</span><span class="n">speculative_decoding_mode</span></div>
 </div>
@@ -2389,15 +2396,18 @@
         <span class="c1"># Interleave block scale for NVFP4 plugin.</span>
         <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">list</span><span class="p">(</span><span class="n">weights</span><span class="p">):</span>
             <span class="k">if</span> <span class="n">name</span><span class="o">.</span><span class="n">endswith</span><span class="p">(</span><span class="s1">&#39;weights_scaling_factor&#39;</span><span class="p">):</span>
-                <span class="n">ori_shape</span> <span class="o">=</span> <span class="n">weights</span><span class="p">[</span><span class="n">name</span><span class="p">]</span><span class="o">.</span><span class="n">shape</span>
+                <span class="n">out_features</span><span class="p">,</span> <span class="n">in_features</span> <span class="o">=</span> <span class="n">weights</span><span class="p">[</span><span class="n">name</span><span class="p">]</span><span class="o">.</span><span class="n">shape</span>
+                <span class="n">nrows</span> <span class="o">=</span> <span class="n">fp4_utils</span><span class="o">.</span><span class="n">pad_up</span><span class="p">(</span><span class="n">out_features</span><span class="p">,</span> <span class="mi">128</span><span class="p">)</span>
+                <span class="n">ncols</span> <span class="o">=</span> <span class="n">fp4_utils</span><span class="o">.</span><span class="n">pad_up</span><span class="p">(</span><span class="n">in_features</span><span class="p">,</span> <span class="mi">4</span><span class="p">)</span>
                 <span class="n">new_name</span> <span class="o">=</span> <span class="n">name</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">&#39;weights_scaling_factor&#39;</span><span class="p">,</span>
                                         <span class="s1">&#39;weights_block_scaling_factor&#39;</span><span class="p">)</span>
                 <span class="n">weights</span><span class="p">[</span><span class="n">new_name</span><span class="p">]</span> <span class="o">=</span> <span class="n">weights</span><span class="p">[</span><span class="n">name</span><span class="p">]</span>
                 <span class="n">weights</span><span class="p">[</span>
                     <span class="n">new_name</span> <span class="o">+</span>
                     <span class="s2">&quot;_interleaved&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">ops</span><span class="o">.</span><span class="n">tensorrt_llm</span><span class="o">.</span><span class="n">nvfp4_block_scale_interleave</span><span class="p">(</span>
-                        <span class="n">weights</span><span class="p">[</span><span class="n">name</span><span class="p">]</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="n">float4_sf_dtype</span><span class="p">)</span><span class="o">.</span><span class="n">cpu</span><span class="p">()</span><span class="o">.</span><span class="n">contiguous</span><span class="p">(</span>
-                        <span class="p">))</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="n">ori_shape</span><span class="p">)</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="n">float4_sf_dtype</span><span class="p">)</span>
+                        <span class="n">weights</span><span class="p">[</span><span class="n">name</span><span class="p">]</span><span class="o">.</span><span class="n">view</span><span class="p">(</span><span class="n">fp4_utils</span><span class="o">.</span><span class="n">float4_sf_dtype</span><span class="p">)</span><span class="o">.</span><span class="n">cpu</span><span class="p">(</span>
+                        <span class="p">)</span><span class="o">.</span><span class="n">contiguous</span><span class="p">())</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="n">nrows</span><span class="p">,</span> <span class="n">ncols</span><span class="p">)</span><span class="o">.</span><span class="n">view</span><span class="p">(</span>
+                            <span class="n">fp4_utils</span><span class="o">.</span><span class="n">float4_sf_dtype</span><span class="p">)</span>
                 <span class="n">weights</span><span class="o">.</span><span class="n">pop</span><span class="p">(</span><span class="n">name</span><span class="p">)</span>
             <span class="k">if</span> <span class="n">name</span><span class="o">.</span><span class="n">endswith</span><span class="p">(</span><span class="s1">&#39;weights_scaling_factor_2&#39;</span><span class="p">):</span>
                 <span class="n">new_name</span> <span class="o">=</span> <span class="n">name</span><span class="o">.</span><span class="n">replace</span><span class="p">(</span><span class="s1">&#39;weights_scaling_factor_2&#39;</span><span class="p">,</span>
@@ -2650,6 +2660,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/mpt/model.html b/latest/_modules/tensorrt_llm/models/mpt/model.html
index fe0ee997ad..3f9a382258 100644
--- a/latest/_modules/tensorrt_llm/models/mpt/model.html
+++ b/latest/_modules/tensorrt_llm/models/mpt/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -799,6 +803,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/multimodal_encoders/config.html b/latest/_modules/tensorrt_llm/models/multimodal_encoders/config.html
index 590c33e21c..7bd34b393a 100644
--- a/latest/_modules/tensorrt_llm/models/multimodal_encoders/config.html
+++ b/latest/_modules/tensorrt_llm/models/multimodal_encoders/config.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -733,6 +737,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/multimodal_encoders/model.html b/latest/_modules/tensorrt_llm/models/multimodal_encoders/model.html
index da62711161..5fb65b6437 100644
--- a/latest/_modules/tensorrt_llm/models/multimodal_encoders/model.html
+++ b/latest/_modules/tensorrt_llm/models/multimodal_encoders/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -801,6 +805,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/opt/model.html b/latest/_modules/tensorrt_llm/models/opt/model.html
index 18c335588e..64822e5f83 100644
--- a/latest/_modules/tensorrt_llm/models/opt/model.html
+++ b/latest/_modules/tensorrt_llm/models/opt/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -804,6 +808,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/phi/model.html b/latest/_modules/tensorrt_llm/models/phi/model.html
index 542546f2cb..b5d512d010 100644
--- a/latest/_modules/tensorrt_llm/models/phi/model.html
+++ b/latest/_modules/tensorrt_llm/models/phi/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -848,6 +852,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/phi3/model.html b/latest/_modules/tensorrt_llm/models/phi3/model.html
index 2f45ed23d2..d9f3c283a4 100644
--- a/latest/_modules/tensorrt_llm/models/phi3/model.html
+++ b/latest/_modules/tensorrt_llm/models/phi3/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -944,6 +948,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/recurrentgemma/model.html b/latest/_modules/tensorrt_llm/models/recurrentgemma/model.html
index 6ad6b4c9d7..1e6a02647c 100644
--- a/latest/_modules/tensorrt_llm/models/recurrentgemma/model.html
+++ b/latest/_modules/tensorrt_llm/models/recurrentgemma/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1247,6 +1251,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/models/redrafter/model.html b/latest/_modules/tensorrt_llm/models/redrafter/model.html
index 82d3430574..ff2ecd37dd 100644
--- a/latest/_modules/tensorrt_llm/models/redrafter/model.html
+++ b/latest/_modules/tensorrt_llm/models/redrafter/model.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -919,6 +923,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/plugin/plugin.html b/latest/_modules/tensorrt_llm/plugin/plugin.html
index 61b6be5a89..ab4d18e833 100644
--- a/latest/_modules/tensorrt_llm/plugin/plugin.html
+++ b/latest/_modules/tensorrt_llm/plugin/plugin.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -533,7 +537,8 @@
 <span class="kn">from</span><span class="w"> </span><span class="nn">.._ipc_utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">IpcMemory</span><span class="p">,</span> <span class="n">can_access_peer</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">.._utils</span><span class="w"> </span><span class="kn">import</span> <span class="n">get_sm_version</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">..bindings.internal.runtime</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">lamport_initialize</span><span class="p">,</span>
-                                         <span class="n">lamport_initialize_all</span><span class="p">)</span>
+                                         <span class="n">lamport_initialize_all</span><span class="p">,</span>
+                                         <span class="n">max_workspace_size_lowprecision</span><span class="p">)</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">..logger</span><span class="w"> </span><span class="kn">import</span> <span class="n">logger</span>
 <span class="kn">from</span><span class="w"> </span><span class="nn">..mapping</span><span class="w"> </span><span class="kn">import</span> <span class="n">Mapping</span>
 
@@ -1191,7 +1196,7 @@
 <span class="sd">              Then, each instance of allreduce will reference that tensor automatically.</span>
 <span class="sd">    &quot;&quot;&quot;</span>
     <span class="n">POINTERS_PER_RANK</span> <span class="o">=</span> <span class="mi">7</span>
-    <span class="n">POINTERS_OF_COUNTER</span> <span class="o">=</span> <span class="mi">2</span>
+    <span class="n">POINTERS_OF_COUNTER</span> <span class="o">=</span> <span class="mi">3</span>
 
     <span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
         <span class="bp">self</span><span class="o">.</span><span class="n">workspace</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Tensor</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
@@ -1225,6 +1230,17 @@
             <span class="k">return</span> <span class="mi">16_000_000</span>
         <span class="k">return</span> <span class="mi">8_000_000</span>
 
+    <span class="nd">@staticmethod</span>
+    <span class="k">def</span><span class="w"> </span><span class="nf">max_workspace_size_lowprecision</span><span class="p">(</span><span class="n">tp_size</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
+        <span class="k">return</span> <span class="n">max_workspace_size_lowprecision</span><span class="p">(</span><span class="n">tp_size</span><span class="p">)</span>
+
+    <span class="nd">@staticmethod</span>
+    <span class="k">def</span><span class="w"> </span><span class="nf">initialize_lowprecision_buffers</span><span class="p">(</span><span class="n">workspace</span><span class="p">:</span> <span class="s2">&quot;torch.tensor&quot;</span><span class="p">,</span>
+                                        <span class="n">tp_size</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
+        <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
+        <span class="k">return</span> <span class="n">torch</span><span class="o">.</span><span class="n">ops</span><span class="o">.</span><span class="n">trtllm</span><span class="o">.</span><span class="n">initialize_static_lowprecision_buffers</span><span class="p">(</span>
+            <span class="n">workspace</span><span class="p">,</span> <span class="n">tp_size</span><span class="p">)</span>
+
     <span class="nd">@staticmethod</span>
     <span class="k">def</span><span class="w"> </span><span class="nf">allocate_workspace</span><span class="p">(</span><span class="n">mapping</span><span class="p">:</span> <span class="n">Mapping</span><span class="p">,</span>
                            <span class="n">size</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">IpcMemory</span><span class="p">],</span> <span class="s2">&quot;torch.tensor&quot;</span><span class="p">]:</span>
@@ -1239,11 +1255,11 @@
         <span class="n">ipc_buffers_pong</span> <span class="o">=</span> <span class="n">IpcMemory</span><span class="p">(</span><span class="n">mapping</span><span class="p">,</span> <span class="n">ipc_buffers_size</span><span class="p">,</span>
                                      <span class="n">is_p2p_supported</span><span class="p">)</span>
         <span class="n">ipc_barriers_in</span> <span class="o">=</span> <span class="n">IpcMemory</span><span class="p">(</span>
-            <span class="n">mapping</span><span class="p">,</span> <span class="n">IpcMemory</span><span class="o">.</span><span class="n">IPC_BARRIERS_SIZE_PER_GPU</span> <span class="o">*</span> <span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span> <span class="o">*</span> <span class="mi">2</span><span class="p">,</span>
-            <span class="n">is_p2p_supported</span><span class="p">)</span>
+            <span class="n">mapping</span><span class="p">,</span> <span class="n">IpcMemory</span><span class="o">.</span><span class="n">IPC_BARRIERS_SIZE_PER_GPU</span> <span class="o">*</span> <span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span> <span class="o">*</span> <span class="mi">2</span> <span class="o">*</span>
+            <span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span><span class="p">,</span> <span class="n">is_p2p_supported</span><span class="p">)</span>
         <span class="n">ipc_barriers_out</span> <span class="o">=</span> <span class="n">IpcMemory</span><span class="p">(</span>
-            <span class="n">mapping</span><span class="p">,</span> <span class="n">IpcMemory</span><span class="o">.</span><span class="n">IPC_BARRIERS_SIZE_PER_GPU</span> <span class="o">*</span> <span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span> <span class="o">*</span> <span class="mi">2</span><span class="p">,</span>
-            <span class="n">is_p2p_supported</span><span class="p">)</span>
+            <span class="n">mapping</span><span class="p">,</span> <span class="n">IpcMemory</span><span class="o">.</span><span class="n">IPC_BARRIERS_SIZE_PER_GPU</span> <span class="o">*</span> <span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span> <span class="o">*</span> <span class="mi">2</span> <span class="o">*</span>
+            <span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span><span class="p">,</span> <span class="n">is_p2p_supported</span><span class="p">)</span>
         <span class="n">lamport_buffers_size</span> <span class="o">=</span> <span class="mi">1</span> <span class="k">if</span> <span class="n">force_deterministic</span> <span class="k">else</span> <span class="n">size</span> <span class="o">*</span> <span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span>
         <span class="n">lamport_buffers_0</span> <span class="o">=</span> <span class="n">IpcMemory</span><span class="p">(</span><span class="n">mapping</span><span class="p">,</span> <span class="n">lamport_buffers_size</span><span class="p">,</span>
                                       <span class="n">is_p2p_supported</span><span class="p">)</span>
@@ -1261,16 +1277,55 @@
                 <span class="n">lamport_buffers_size</span><span class="p">,</span>
             <span class="p">)</span>
         <span class="n">buffers</span> <span class="o">=</span> <span class="p">[</span>
-            <span class="n">ipc_buffers_ping</span><span class="p">,</span> <span class="n">ipc_buffers_pong</span><span class="p">,</span> <span class="n">ipc_barriers_in</span><span class="p">,</span>
-            <span class="n">ipc_barriers_out</span><span class="p">,</span> <span class="n">lamport_buffers_0</span><span class="p">,</span> <span class="n">lamport_buffers_1</span><span class="p">,</span>
-            <span class="n">lamport_buffers_2</span>
+            <span class="n">ipc_buffers_ping</span><span class="p">,</span>
+            <span class="n">ipc_buffers_pong</span><span class="p">,</span>
+            <span class="n">ipc_barriers_in</span><span class="p">,</span>
+            <span class="n">ipc_barriers_out</span><span class="p">,</span>
+            <span class="n">lamport_buffers_0</span><span class="p">,</span>
+            <span class="n">lamport_buffers_1</span><span class="p">,</span>
+            <span class="n">lamport_buffers_2</span><span class="p">,</span>
+            <span class="c1"># Start from 1 since 0 represents released state for barrier at the beginning of the all_reduce.</span>
+            <span class="c1"># The last element is the barrier flag counter.</span>
+            <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s2">&quot;cuda&quot;</span><span class="p">)</span>
         <span class="p">]</span>
 
         <span class="k">return</span> <span class="n">buffers</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">(</span>
             <span class="n">ipc_buffers_ping</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span> <span class="n">ipc_buffers_pong</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span>
             <span class="n">ipc_barriers_in</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span> <span class="n">ipc_barriers_out</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span>
             <span class="n">lamport_buffers_0</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span> <span class="n">lamport_buffers_1</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span>
-            <span class="n">lamport_buffers_2</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span> <span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">+</span> <span class="p">[</span><span class="mi">0</span><span class="p">],</span>
+            <span class="n">lamport_buffers_2</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span> <span class="p">[</span><span class="n">buffers</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span><span class="o">.</span><span class="n">data_ptr</span><span class="p">()]</span> <span class="o">+</span>
+            <span class="p">[</span><span class="n">buffers</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">][</span><span class="mi">1</span><span class="p">:]</span><span class="o">.</span><span class="n">data_ptr</span><span class="p">()]</span> <span class="o">+</span> <span class="p">[</span><span class="n">buffers</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">][</span><span class="mi">2</span><span class="p">:]</span><span class="o">.</span><span class="n">data_ptr</span><span class="p">()],</span>
+            <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span>
+            <span class="n">device</span><span class="o">=</span><span class="s2">&quot;cpu&quot;</span><span class="p">)</span>
+
+    <span class="nd">@staticmethod</span>
+    <span class="k">def</span><span class="w"> </span><span class="nf">allocate_lowprecision_workspace</span><span class="p">(</span>
+            <span class="n">mapping</span><span class="p">:</span> <span class="n">Mapping</span><span class="p">,</span>
+            <span class="n">size</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">IpcMemory</span><span class="p">],</span> <span class="s2">&quot;torch.tensor&quot;</span><span class="p">]:</span>
+        <span class="kn">import</span><span class="w"> </span><span class="nn">torch</span>
+
+        <span class="c1"># Force pull mode and disable lamport when force deterministic is enabled, for reducing device memory usage.</span>
+        <span class="n">is_p2p_supported</span> <span class="o">=</span> <span class="n">can_access_peer</span><span class="p">(</span><span class="n">mapping</span><span class="p">)</span>
+        <span class="n">ipc_buffers_size</span> <span class="o">=</span> <span class="n">size</span>
+        <span class="n">ipc_buffers_ping</span> <span class="o">=</span> <span class="n">IpcMemory</span><span class="p">(</span><span class="n">mapping</span><span class="p">,</span> <span class="n">ipc_buffers_size</span><span class="p">,</span>
+                                     <span class="n">is_p2p_supported</span><span class="p">)</span>
+        <span class="n">ipc_buffers_pong</span> <span class="o">=</span> <span class="n">IpcMemory</span><span class="p">(</span><span class="n">mapping</span><span class="p">,</span> <span class="n">ipc_buffers_size</span><span class="p">,</span>
+                                     <span class="n">is_p2p_supported</span><span class="p">)</span>
+        <span class="n">ipc_barriers_in</span> <span class="o">=</span> <span class="n">IpcMemory</span><span class="p">(</span>
+            <span class="n">mapping</span><span class="p">,</span> <span class="n">IpcMemory</span><span class="o">.</span><span class="n">IPC_BARRIERS_SIZE_PER_GPU</span> <span class="o">*</span> <span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span> <span class="o">*</span> <span class="mi">2</span><span class="p">,</span>
+            <span class="n">is_p2p_supported</span><span class="p">)</span>
+        <span class="n">ipc_barriers_out</span> <span class="o">=</span> <span class="n">IpcMemory</span><span class="p">(</span>
+            <span class="n">mapping</span><span class="p">,</span> <span class="n">IpcMemory</span><span class="o">.</span><span class="n">IPC_BARRIERS_SIZE_PER_GPU</span> <span class="o">*</span> <span class="n">mapping</span><span class="o">.</span><span class="n">tp_size</span> <span class="o">*</span> <span class="mi">2</span><span class="p">,</span>
+            <span class="n">is_p2p_supported</span><span class="p">)</span>
+        <span class="n">buffers</span> <span class="o">=</span> <span class="p">[</span>
+            <span class="n">ipc_buffers_ping</span><span class="p">,</span> <span class="n">ipc_buffers_pong</span><span class="p">,</span> <span class="n">ipc_barriers_in</span><span class="p">,</span>
+            <span class="n">ipc_barriers_out</span>
+        <span class="p">]</span>
+
+        <span class="k">return</span> <span class="n">buffers</span><span class="p">,</span> <span class="n">torch</span><span class="o">.</span><span class="n">tensor</span><span class="p">(</span>
+            <span class="n">ipc_buffers_ping</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span> <span class="n">ipc_buffers_pong</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span>
+            <span class="n">ipc_barriers_in</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span> <span class="n">ipc_barriers_out</span><span class="o">.</span><span class="n">serialize</span><span class="p">()</span> <span class="o">+</span> <span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">+</span>
+            <span class="p">[</span><span class="mi">0</span><span class="p">],</span>
             <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span>
             <span class="n">device</span><span class="o">=</span><span class="s2">&quot;cpu&quot;</span><span class="p">)</span>
 
@@ -1424,6 +1479,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/quantization/mode.html b/latest/_modules/tensorrt_llm/quantization/mode.html
index 18d684332b..1a57fca377 100644
--- a/latest/_modules/tensorrt_llm/quantization/mode.html
+++ b/latest/_modules/tensorrt_llm/quantization/mode.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1027,6 +1031,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/quantization/quantize_by_modelopt.html b/latest/_modules/tensorrt_llm/quantization/quantize_by_modelopt.html
index 7fbb7ddb31..bffc8235ee 100644
--- a/latest/_modules/tensorrt_llm/quantization/quantize_by_modelopt.html
+++ b/latest/_modules/tensorrt_llm/quantization/quantize_by_modelopt.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1888,6 +1892,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/runtime/enc_dec_model_runner.html b/latest/_modules/tensorrt_llm/runtime/enc_dec_model_runner.html
index 6543130e96..14a8374cb0 100644
--- a/latest/_modules/tensorrt_llm/runtime/enc_dec_model_runner.html
+++ b/latest/_modules/tensorrt_llm/runtime/enc_dec_model_runner.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1158,6 +1162,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/runtime/generation.html b/latest/_modules/tensorrt_llm/runtime/generation.html
index bb9167849c..f55c97392a 100644
--- a/latest/_modules/tensorrt_llm/runtime/generation.html
+++ b/latest/_modules/tensorrt_llm/runtime/generation.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -5446,6 +5450,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/runtime/kv_cache_manager.html b/latest/_modules/tensorrt_llm/runtime/kv_cache_manager.html
index 122753aa16..84659692e3 100644
--- a/latest/_modules/tensorrt_llm/runtime/kv_cache_manager.html
+++ b/latest/_modules/tensorrt_llm/runtime/kv_cache_manager.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1105,6 +1109,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/runtime/model_runner.html b/latest/_modules/tensorrt_llm/runtime/model_runner.html
index 331b1818aa..2bb6e85224 100644
--- a/latest/_modules/tensorrt_llm/runtime/model_runner.html
+++ b/latest/_modules/tensorrt_llm/runtime/model_runner.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1617,6 +1621,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/runtime/model_runner_cpp.html b/latest/_modules/tensorrt_llm/runtime/model_runner_cpp.html
index b5efa1a9d0..5f129f5cef 100644
--- a/latest/_modules/tensorrt_llm/runtime/model_runner_cpp.html
+++ b/latest/_modules/tensorrt_llm/runtime/model_runner_cpp.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1010,7 +1014,9 @@
     <span class="nd">@property</span>
     <span class="k">def</span><span class="w"> </span><span class="nf">num_layers</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
         <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">model_config</span><span class="o">.</span><span class="n">num_layers</span><span class="p">(</span>
-            <span class="bp">self</span><span class="o">.</span><span class="n">world_config</span><span class="o">.</span><span class="n">pipeline_parallelism</span><span class="p">)</span>
+            <span class="bp">self</span><span class="o">.</span><span class="n">world_config</span><span class="o">.</span><span class="n">pipeline_parallelism</span><span class="p">,</span>
+            <span class="bp">self</span><span class="o">.</span><span class="n">world_config</span><span class="o">.</span><span class="n">pipeline_parallel_rank</span><span class="p">,</span>
+        <span class="p">)</span>
 
     <span class="nd">@property</span>
     <span class="k">def</span><span class="w"> </span><span class="nf">max_sequence_length</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
@@ -1819,6 +1825,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/runtime/multimodal_model_runner.html b/latest/_modules/tensorrt_llm/runtime/multimodal_model_runner.html
index d95fa8d16f..3418ba3496 100644
--- a/latest/_modules/tensorrt_llm/runtime/multimodal_model_runner.html
+++ b/latest/_modules/tensorrt_llm/runtime/multimodal_model_runner.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -3347,6 +3351,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/runtime/session.html b/latest/_modules/tensorrt_llm/runtime/session.html
index 201d6ff0f9..4c54b2be61 100644
--- a/latest/_modules/tensorrt_llm/runtime/session.html
+++ b/latest/_modules/tensorrt_llm/runtime/session.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -965,6 +969,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_modules/tensorrt_llm/sampling_params.html b/latest/_modules/tensorrt_llm/sampling_params.html
index b057c84293..24f7438145 100644
--- a/latest/_modules/tensorrt_llm/sampling_params.html
+++ b/latest/_modules/tensorrt_llm/sampling_params.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -513,6 +517,7 @@
 <span class="kn">from</span><span class="w"> </span><span class="nn">pydantic</span><span class="w"> </span><span class="kn">import</span> <span class="n">BaseModel</span>
 
 <span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.bindings</span><span class="w"> </span><span class="kn">import</span> <span class="n">executor</span> <span class="k">as</span> <span class="n">tllme</span>
+<span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.executor.serialization</span><span class="w"> </span><span class="kn">import</span> <span class="n">register_approved_ipc_class</span>
 
 
 <div class="viewcode-block" id="GuidedDecodingParams">
@@ -579,6 +584,14 @@
 <span class="sd">        &quot;&quot;&quot;</span>
         <span class="k">pass</span>  <span class="c1"># noqa</span>
 
+    <span class="k">def</span><span class="w"> </span><span class="nf">__init_subclass__</span><span class="p">(</span><span class="bp">cls</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
+<span class="w">        </span><span class="sd">&quot;&quot;&quot;</span>
+<span class="sd">        This method is called when a class inherits from LogitsProcessor.</span>
+<span class="sd">        &quot;&quot;&quot;</span>
+        <span class="c1"># Register subclass as an approved class for deserialization across IPC boundaries.</span>
+        <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">__init_subclass__</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
+        <span class="n">register_approved_ipc_class</span><span class="p">(</span><span class="bp">cls</span><span class="p">)</span>
+
 
 <span class="k">class</span><span class="w"> </span><span class="nc">BatchedLogitsProcessor</span><span class="p">(</span><span class="n">ABC</span><span class="p">):</span>
 <span class="w">    </span><span class="sd">&quot;&quot;&quot;Base class for batched logits processor.</span>
@@ -790,20 +803,18 @@
 <span class="sd">        For instance, while the greedy decoding with n &gt; 1 is capable in the</span>
 <span class="sd">        Executor class of C++ runtime, the LLM API disallows such combination.</span>
 <span class="sd">        &#39;&#39;&#39;</span>
-        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">best_of</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
-            <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">best_of</span> <span class="o">&gt;</span> <span class="mi">1</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">best_of</span> <span class="o">&lt;</span> <span class="bp">self</span><span class="o">.</span><span class="n">n</span><span class="p">:</span>
-                <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
-                    <span class="sa">f</span><span class="s1">&#39;In beam search, best_of (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">best_of</span><span class="si">}</span><span class="s1">) must be &#39;</span>
-                    <span class="sa">f</span><span class="s1">&#39;greater than or equal to n (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">n</span><span class="si">}</span><span class="s1">).&#39;</span><span class="p">)</span>
+        <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">best_of</span> <span class="o">&lt;</span> <span class="bp">self</span><span class="o">.</span><span class="n">n</span><span class="p">:</span>
+            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
+                <span class="sa">f</span><span class="s2">&quot;best_of (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">best_of</span><span class="si">}</span><span class="s2">) cannot be less than n (</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">n</span><span class="si">}</span><span class="s2">)&quot;</span><span class="p">)</span>
 
-            <span class="k">if</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">best_of</span> <span class="o">&gt;</span> <span class="mi">1</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">_greedy_decoding</span> <span class="ow">and</span>
-                    <span class="ow">not</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;TLLM_ALLOW_N_GREEDY_DECODING&#39;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)):</span>
-                <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
-                    <span class="sa">f</span><span class="s1">&#39;Greedy decoding in the LLM API does not allow multiple &#39;</span>
-                    <span class="sa">f</span><span class="s1">&#39;returns. Please set to best_of=1, got best_of=</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">best_of</span><span class="si">}</span><span class="s1">. &#39;</span>
-                    <span class="sa">f</span><span class="s1">&#39;Please set to best_of=1 or set an environment variable &#39;</span>
-                    <span class="sa">f</span><span class="s1">&#39;TLLM_ALLOW_N_GREEDY_DECODING=1 to allow best_of &gt; 1 &#39;</span>
-                    <span class="sa">f</span><span class="s1">&#39;under the greedy decoding.&#39;</span><span class="p">)</span>
+        <span class="k">if</span> <span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">best_of</span> <span class="o">&gt;</span> <span class="mi">1</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">_greedy_decoding</span>
+                <span class="ow">and</span> <span class="ow">not</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s1">&#39;TLLM_ALLOW_N_GREEDY_DECODING&#39;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)):</span>
+            <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
+                <span class="sa">f</span><span class="s1">&#39;Greedy decoding in the LLM API does not allow multiple &#39;</span>
+                <span class="sa">f</span><span class="s1">&#39;returns. Please set to best_of=1, got best_of=</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="n">best_of</span><span class="si">}</span><span class="s1">. &#39;</span>
+                <span class="sa">f</span><span class="s1">&#39;Please set to best_of=1 or set an environment variable &#39;</span>
+                <span class="sa">f</span><span class="s1">&#39;TLLM_ALLOW_N_GREEDY_DECODING=1 to allow best_of &gt; 1 &#39;</span>
+                <span class="sa">f</span><span class="s1">&#39;under the greedy decoding.&#39;</span><span class="p">)</span>
 
         <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">truncate_prompt_tokens</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">truncate_prompt_tokens</span> <span class="o">&lt;</span> <span class="mi">1</span><span class="p">:</span>
             <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
@@ -1092,6 +1103,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/_sources/_cpp_gen/executor.rst.txt b/latest/_sources/_cpp_gen/executor.rst.txt
index 08d47843b4..d3ca9cd473 100644
--- a/latest/_sources/_cpp_gen/executor.rst.txt
+++ b/latest/_sources/_cpp_gen/executor.rst.txt
@@ -16,6 +16,12 @@ ________
 .. doxygenfile:: tensor.h
    :project: TensorRT-LLM
 
+transferAgent.h
+_______________
+
+.. doxygenfile:: transferAgent.h
+   :project: TensorRT-LLM
+
 serialization.h
 _______________
 
diff --git a/latest/_sources/advanced/kv-cache-management.md.txt b/latest/_sources/advanced/kv-cache-management.md.txt
new file mode 100644
index 0000000000..f4506d6ee9
--- /dev/null
+++ b/latest/_sources/advanced/kv-cache-management.md.txt
@@ -0,0 +1,75 @@
+(kv-cache-management)=
+
+# KV Cache Management: Pools, Blocks, and Events
+
+This document provides an overview of the internal hierarchy and event system for paged KV cache management, as implemented in the TensorRT-LLM codebase.
+
+For more information on KV cache reuse see [KV cache reuse](kv-cache-reuse.md).
+
+---
+
+## Hierarchy: Pool, Block, and Page
+
+### **Block**
+- **Definition:** The smallest unit of KV cache allocation. A `KVCacheBlock` holds metadata (not the actual data) for a chunk of KV cache.
+- **Purpose:** Each block represents a fixed number of tokens' worth of KV data (can be specified by `tokens_per_block` parameter).
+- **Usage:** Blocks are allocated, reused, or evicted as sequences are processed.
+
+### **Page**
+- **Definition:** In this codebase, "page" is often used interchangeably with "block" (as in "paged KV cache"), but technically, a page could refer to a memory page (hardware-level), while a block is a logical unit for the cache.
+- **In Practice:** The code uses "block" as the main unit; "page" is not a distinct class or struct.
+
+### **Pool**
+- **Definition:** A pool is a contiguous memory buffer (or set of buffers) that holds the actual KV data for one or more layers.
+- **Types:** There are primary pools (fast GPU memory) and secondary pools (slower, e.g., CPU or offload memory).
+- **Organization:** Each pool can serve multiple layers that share the same KV head configuration. Pools are managed by `KVCacheBlockPool` and tracked in vectors in `WindowBlockManager`.
+- **Block ↔ Pool:** Each block is an index into a pool; the pool provides the actual storage, while the block is the metadata handle.
+
+### **WindowBlockManager/BlockManager**
+
+TRT-LLM supports 2 complex features related to KV cache management:
+1. **Variable Group-Query Attention (VGQA)** - i.e. a different `num_kv_heads` value for different layers.
+2. **Variable Sliding Window Attention (VSWA)** - i.e. a different `attention_window_size` value for different layers.
+
+In order to support both of these features, the pool management works as described below.
+
+But in the simple, *most common case*, for most models, where
+1. [MHA/MQA/Non-variable GQA](gpt-attention.md#multi-head-multi-query-and-group-query-attention), i.e., same `num_kv_heads` value for all layers,
+2. Global attention/[SWA](gpt-attention.md#sliding-window-attention-cyclic-rolling-buffer-kv-cache), i.e., same `attention_window_size` value for all layers,
+
+only a *single* pool will be created within the structure described below.
+
+#### KV Cache Pool Management
+
+- **WindowBlockManager:** Manages blocks and pools for a specific attention window size. Within a `WindowBlockManager`, there can be multiple pools - each corresponding to a unique number of KV heads - i.e., to support VGQA.
+- **BlockManager:** Manages all `WindowBlockManager` instances, one per unique window size.
+
+**Hierarchy Summary:**
+- **Pool** (memory buffer for KV data)
+  - Contains many blocks.
+- **Blocks** (metadata for a chunk of the pool, each block = `tokens_per_block` tokens)
+    - (Optionally, blocks can be swapped between primary/secondary pools.)
+- **BlockManager/WindowBlockManager**: Manage pools and blocks, handle allocation, reuse, and eviction.
+
+---
+
+## Events in `KVCacheEventManager`
+
+The `KVCacheEventManager` is responsible for tracking and reporting significant changes in the state of the KV cache. Events are used for logging, debugging, or possibly for external monitoring.
+
+### **Types of Events**
+- **Created Event:** When pools or blocks are created/allocated.
+- **Updated Event:** When a block's state changes (e.g., moved between primary/secondary, priority updated).
+- **Removed Event:** When a block is removed from the cache (evicted or released).
+- **Stored Event:** When blocks are stored for potential reuse (e.g., after a sequence finishes and its blocks are reusable).
+
+### **What Triggers an Event?**
+- **Allocation/Deallocation:** Creating or freeing memory pools or blocks.
+- **Eviction/Reuse:** When a block is evicted, reused, or its priority changes.
+- **Block Movement:** When a block is moved between memory levels (primary ↔ secondary).
+- **Block Storage:** When blocks are stored for future reuse (e.g., after a sequence completes).
+
+**In summary:**
+An "event" is any significant change in the lifecycle or state of a KV cache block or pool, tracked for monitoring, debugging, or optimization purposes.
+
+---
diff --git a/latest/_sources/advanced/lowprecision-pcie-allreduce.md.txt b/latest/_sources/advanced/lowprecision-pcie-allreduce.md.txt
new file mode 100644
index 0000000000..57ca754c4e
--- /dev/null
+++ b/latest/_sources/advanced/lowprecision-pcie-allreduce.md.txt
@@ -0,0 +1,65 @@
+# Low-Precision-AllReduce
+
+```{note}
+Note:
+This feature is optimized for PCIe-based GPU topologies and may affect model accuracy. Please evaluate precision impact for your specific workload.
+```
+
+
+TRT-LLM supports `low-precision-allreduce`, a communication optimization that accelerates AllReduce operations in PCIe-based GPU environments. This feature quantizes FP16/BF16 data to FP8 during network transmission, reducing communication volume and improving performance.
+
+## Algorithm
+
+The Low-Precision-AllReduce algorithm works by:
+1. Quantizing input FP16/BF16 tensors to FP8 format before network transmission
+
+
+   **Quantization details**: We use a "per-warp" quantization approach where each CUDA warp (32 threads) processes a batch of data. In each warp, 31 threads quantize FP16/BF16 values to FP8 e4m3 format (16 bytes per thread), while the last thread transmits a scalar value. This results in each warp collectively quantizing 496 elements plus one scalar at a time.
+
+2. Transmitting the quantized data through the network
+3. Dequantizing received data back to the original precision
+4. Performing the reduction operation
+
+In 8-GPU scenarios, this approach shifts the communication bottleneck from cross-NUMA QPI to the PCIe switch, resulting in better overall performance.
+
+## Topology Requirements
+
+![8x L20/L40s Node Architecture](images/8x_l20_L40S_node_architecture.png)
+
+Low-Precision-AllReduce is specifically designed for the topology shown above, where:
+- Each node contains 2 NUMA domains
+- Each NUMA domain has 4 GPUs connected via PCIe switch
+- GPUs within the same NUMA node communicate via the PCIe switch
+
+**Important:** This optimization will not accelerate performance in different topologies (e.g., where each GPU is in a separate NUMA domain).
+
+## Usage
+
+The Low-Precision-AllReduce algorithm can be enabled in two ways:
+
+1. **Direct specification** in your code:
+```
+AllReduce allreduce(mapping=mapping, strategy=AllReduceStrategy.LOWPRECISION);
+```
+2. **Environment variable control** with AUTO strategy:
+```
+// In your code
+AllReduce allreduce(mapping=mapping, strategy=AllReduceStrategy.AUTO);
+// Set environment variable before running
+export FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY=1
+```
+
+## Performance and Accuracy Considerations
+
+Low-Precision-AllReduce reduces communication volume by using FP8 data format for transmission. This optimization:
+- Improves performance for large message sizes in PCIe-based topologies
+- May slightly reduce numerical precision
+- Automatically falls back to other strategies when no performance benefit is expected (e.g., with NVLink or small messages)
+
+Users should evaluate the precision impact on their specific models and workloads.
+
+## Environment Variables
+
+- `FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY`: When set to `1`, forces the use of the low-precision algorithm with the AUTO strategy. If the algorithm determines it cannot provide performance benefits, it will automatically fall back to other strategies.
+
+**Note**: When TensorRT-LLM is compiled without the `ENABLE_FP8` option, enabling Low-Precision-AllReduce has no effect.
diff --git a/latest/_sources/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt b/latest/_sources/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt
index 7f90c391c0..d510209b4a 100644
--- a/latest/_sources/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt
+++ b/latest/_sources/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md.txt
@@ -134,9 +134,8 @@ To do the benchmark, run the following command:
 YOUR_DATA_PATH=<your dataset file following the format>
 
 cat >./extra-llm-api-config.yml<<EOF
-pytorch_backend_config:
-    use_cuda_graph: true
-    moe_backend: TRTLLM
+use_cuda_graph: true
+moe_backend: TRTLLM
 speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
@@ -202,21 +201,20 @@ python ${YOUR_WORK_PATH}/benchmarks/cpp/prepare_dataset.py \
 YOUR_DATA_PATH=./dataset.txt
 
 cat >./extra-llm-api-config.yml <<EOF
-pytorch_backend_config:
-    use_cuda_graph: true
-    cuda_graph_padding_enabled: true
-    cuda_graph_batch_sizes:
-    - 1
-    - 2
-    - 4
-    - 8
-    - 16
-    - 32
-    - 64
-    - 128
-    - 256
-    - 384
-    print_iter_log: true
+use_cuda_graph: true
+cuda_graph_padding_enabled: true
+cuda_graph_batch_sizes:
+- 1
+- 2
+- 4
+- 8
+- 16
+- 32
+- 64
+- 128
+- 256
+- 384
+print_iter_log: true
 enable_attention_dp: true
 EOF
 
@@ -257,8 +255,7 @@ To do the benchmark, run the following command:
 YOUR_DATA_PATH=<your dataset file following the format>
 
 cat >./extra-llm-api-config.yml<<EOF
-pytorch_backend_config:
-    use_cuda_graph: true
+use_cuda_graph: true
 speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
@@ -307,10 +304,9 @@ python ${YOUR_WORK_PATH}/benchmarks/cpp/prepare_dataset.py \
 YOUR_DATA_PATH=./dataset.txt
 
 cat >./extra-llm-api-config.yml<<EOF
-pytorch_backend_config:
-    use_cuda_graph: true
-    cuda_graph_batch_sizes:
-    - 128
+use_cuda_graph: true
+cuda_graph_batch_sizes:
+- 128
 enable_attention_dp: true
 EOF
 
diff --git a/latest/_sources/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md.txt b/latest/_sources/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md.txt
index 6a19c021e6..b43b8ed004 100644
--- a/latest/_sources/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md.txt
+++ b/latest/_sources/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md.txt
@@ -50,7 +50,7 @@ Output Sequence Length (OSL): 2k tokens
 ### Model Architecture
 The base DeepSeek-R1 main model contains: 3x dense layers (initial) and 58x MoE layers, there is also 1x Multi-Tokens Prediction (MTP) layer (MoE-architecture equivalent) for speculative decoding.  Our optimized configuration extends the MTP layer to 3x layers using autoregressive styling for peak performance exploration.
 
-<img src="../media/tech_blog1_model_overview.png?raw=true" alt="tech_blog1_model_overview" width="500" height="auto">
+<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog1_model_overview.png?raw=true" alt="tech_blog1_model_overview" width="500" height="auto">
 
 ### Precision Strategy
 We have explored a mixed precision recipe, which provides a better tradeoff between accuracy and performance.
@@ -84,7 +84,7 @@ We have also explored and introduced mixed parallel strategy on 8xB200 GPUs. Spe
 ### Everything in One Diagram
 Now let's put everything into one diagram, which represents a MoE layer from a decoding iteration.
 
-<img src="../media/tech_blog1_model_details.png?raw=true" alt="tech_blog1_model_details" width="1600" height="auto">
+<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog1_model_details.png?raw=true" alt="tech_blog1_model_details" width="1600" height="auto">
 
 
 The modules in the diagram are:
@@ -136,7 +136,7 @@ The modules in the diagram are:
 | Optimize CUTLASS Flow: Sparse Experts as GEMMs            |   249    | The code is not open-source yet due to the dependency with internal base environment and we are planning to make it decoupled from internal base environment thus to be able to open-source in the future.|
 | Introduce EP4TP2 for better workload balance              |   253    | Use `--tp 8 --ep 4` when benchmarking                                                                                                                       |
 | Introduce moe_backend=TRTLLM, EP2TP4 for better balance   |   299    | [PR #4280](https://github.com/NVIDIA/TensorRT-LLM/pull/4280)                                                                                          |
-| Optimize Fuse_A_GEMM and Router_GEMM                      |   340    | WIP: [PR #4115](https://github.com/NVIDIA/TensorRT-LLM/pull/4115)                                                                                          |
+| Optimize Fuse_A_GEMM and Router_GEMM                      |   340    | WIP                                                                                          |
 | Relax Acceptance                                          |   **368**    | [deepseek_v3#multi-token-prediction-mtp](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/deepseek_v3#multi-token-prediction-mtp)     |
 
 ### System Level optimizations
@@ -195,7 +195,7 @@ We have introduced multi-streams based optimizations to hide some kernels' overh
 
 #### Sparse Experts as GEMMs (only works when moe_backend=CUTLASS)
 
-<img src="../media/tech_blog1_sparse_exp_as_a_gemm.png?raw=true" alt="tech_blog1_sparse_exp_as_a_gemm" width="800" height="auto">
+<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog1_sparse_exp_as_a_gemm.png?raw=true" alt="tech_blog1_sparse_exp_as_a_gemm" width="800" height="auto">
 
 The existing CUTLASS-based Sparse Experts flow (illustrated in the figure) dispatches input tokens to their designated experts, then applies indexed local reduction on each expert's outputs before a global allreduce. Both dispatching and indexed local reduction incur high overhead in low-latency scenarios. To address this, we propose treating "Sparse Experts as GEMMs" by sending all tokens to each activated expert and masking out unneeded outputs before local reduction. Because grouped GEMMs are memory-bound, the extra computations from redundant tokens have minimal impact, effectively eliminating the costly dispatch and reduction overhead.
 
@@ -229,12 +229,12 @@ We focus on optimizing two kinds of dense GEMMs: Fuse_A_GEMM and RouterGEMM, bec
 ##### Fuse_A_GEMM
 We developed a custom Fuse_A_GEMM that prefetches the majority of its weights into shared memory (enabled by PDL and overlapped with oneshot-AllReduce), significantly enhancing performance. The kernel shows substantial improvements over default GEMM implementation when num_tokens < 16.
 
-<img src="../media/tech_blog1_fuse_a_gemm.png?raw=true" alt="tech_blog1_fuse_a_gemm" width="500" height="auto">
+<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog1_fuse_a_gemm.png?raw=true" alt="tech_blog1_fuse_a_gemm" width="500" height="auto">
 
 ##### RouterGEMM
-By leveraging our internal AI code generator, we automatically generate an optimized RouterGEMM kernel, which delivers substantial improvements over the default GEMM implementation when [num_tokens <=30](https://github.com/NVIDIA/TensorRT-LLM/pull/4115/files#diff-006ae982200a5ef2b27f4aedb526025e64406d3c2fadde329ea745793fac04edR303:~:text=and%20hidden_states.-,size,-(0))
+By leveraging our internal AI code generator, we automatically generate an optimized RouterGEMM kernel, which delivers substantial improvements over the default GEMM implementation when num_tokens <=30.
 
-<img src="../media/tech_blog1_router_gemm.png?raw=true" alt="tech_blog1_router_gemm" width="500" height="auto">
+<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog1_router_gemm.png?raw=true" alt="tech_blog1_router_gemm" width="500" height="auto">
 
 #### Kernel fusion
 Kernel fusion is necessary for min-latency scenario to reduce extra global memory write/read cost, and we support following fusion patterns now
diff --git a/latest/_sources/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md.txt b/latest/_sources/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md.txt
new file mode 100644
index 0000000000..0014f1c7f2
--- /dev/null
+++ b/latest/_sources/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md.txt
@@ -0,0 +1,252 @@
+# DeepSeek R1 MTP Implementation and Optimization
+by NVIDIA TensorRT-LLM team
+## Table of Contents
+- [MTP for inference](#mtp-for-inference)
+  - [Background](#background)
+  - [MTP Vanilla](#mtp-vanilla)
+  - [MTP Eagle](#mtp-eagle)
+- [MTP implementation in TensorRT-LLM](#mtp-implementation-in-tensorrt-llm)
+  - [Basic Implementation](#basic-implementation)
+  - [MTP Modules](#mtp-modules)
+  - [Attention for MTP](#attention-for-mtp)
+  - [How to run DeepSeek models with MTP](#how-to-run-deepseek-models-with-mtp)
+- [MTP optimization - Relaxed Acceptance](#mtp-optimization---relaxed-acceptance)
+  - [Relaxed Acceptance](#relaxed-acceptance)
+  - [How to run the DeepSeek-R1 model with Relaxed Acceptance](#how-to-run-the-deepseek-r1-model-with-relaxed-acceptance)
+- [Evaluation](#evaluation)
+  - [Achieving speedup with MTP speculative decoding](#achieving-speedup-with-mtp-speculative-decoding)
+  - [Accuracy studies for Relaxed Acceptance](#accuracy-studies-for-relaxed-acceptance)
+- [Future Works](#future-works)
+  - [Tree-based speculative decoding support](#tree-based-speculative-decoding-support)
+  - [Eagle3 support](#eagle3-support)
+  - [Fix known issues](#fix-known-issues)
+- [Acknowledgment](#acknowledgment)
+
+
+TensorRT-LLM achieves world-record inference performance for DeepSeek-R1 on NVIDIA Blackwell GPUs, where Multi-Token Prediction (MTP) delivers a significant speedup. In our [previous blog post](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md), we discussed the key optimizations that enable the outstanding inference latency of the DeepSeek-R1 model. This article dives deeper into the implementation and optimization of MTP in TensorRT-LLM.
+
+## MTP for inference
+Inspired by a previous [research work](https://arxiv.org/pdf/2404.19737), MTP is designed to help the DeepSeek-V3 training. It adds additional MTP modules at the end of the main model and uses them to predict additional tokens. In this way, MTP can extend the prediction scope to multiple future tokens at each position to achieve better model accuracy. During inference, those MTP modules can also be used for speculative decoding to improve the generation latency further. In this section, we will introduce the MTP speculative decoding algorithm for LLM inference.
+
+### Background
+Speculative decoding is a popular technique for faster and cost-effective LLM inference. It’s based on the premise that generating multiple future tokens (especially for the decode phase, which is less compute bound) is more efficient than processing a single token. Speculative decoding techniques usually divide the process into a low-cost draft stage and a parallelized verification stage. The draft stage predicts draft tokens by using a small model or a subset of layers in the main model. And the verification stage uses the main model to determine how many of these draft tokens to accept, which is far more efficient than generating one token per iteration.
+
+<div align="center">
+<figure>
+  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_verify_and_accept.png" alt="tech_blog2_verify_and_accept" width="1280" height="auto">
+</figure>
+</div>
+<p align="center"><sub><em>Figure 1. Verification example</em></sub></p>
+
+Figure 1 shows an example of how to verify and accept those draft tokens. Assuming there are a total of 5 draft tokens “ABCDE”, we will extend them to the input token “G”, and input a total of 6 tokens to the main model. After sampling, we can get six different expected tokens, then compare the expected tokens with the draft tokens and accept the longest prefix matched tokens. In this example, the tokens “ABC” are matched. Because “H” is predicted by the main model and the corresponding input token “C” is already accepted, “H” will also be accepted. In this way, we can accept four tokens in a single iteration. MTP also uses this method to verify and accept draft tokens.
+For the draft stage in MTP, there are two different MTP methods, MTP vanilla and MTP eagle. They can be used for different inference cases.
+
+### MTP Vanilla
+
+<div align="center">
+<figure>
+  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_mtp_vanilla.png" alt="tech_blog2_mtp_vanilla" width="640" height="auto">
+</figure>
+</div>
+<p align="left"><sub><em>Figure 2. MTP Vanilla, where t<sub>i</sub> is the input token, d<sub>i</sub> is the predicted draft token, K is the number of MTP modules, and h<sub>i</sub><sup>n</sup> is the hidden state of the n-th MTP module. Note that h<sub>0</sub> means the hidden states of the main model. (Disclaimer: the figures are adapted from the original DeepSeek V3 tech report)</em></sub></p>
+
+
+MTP Vanilla method is more similar to the MTP training, and it sequentially uses different MTP modules to predict multiple draft tokens. This method can support model checkpoints with weights of multiple different MTP modules. And each MTP module will have its own KV cache.
+
+Figure 2 illustrates the MTP vanilla inference. In the context phase, assuming there are a total of four input tokens, we will get the output token $t_5$ and the hidden states after the main model forward. The output token will be appended to the input tokens, then we shift out the first token to get tokens from $t_2$ to $t_5$ as the input tokens of the first MTP module. The hidden states from the main model will be directly used as the input of the first MTP module to predict the first draft token. For the next several MTP modules, we will use the same method to prepare the inputs to predict the sequential draft tokens.
+
+In the generation phase, there will be a little difference. The predicted token $t_5$ and the draft tokens will be used as inputs for the main model. After the main model forward, we will do the verification to get the accepted tokens. In this example, assume that $j$ draft tokens $d_6$~$d_{j+5}$ are accepted. Then we prepare the MTP module inputs. Different from the context phase, we will prepare input IDs and hidden states of a total of $K$ tokens before the last accepted token. In this example, the last accepted token is $t_{j+6}$. Then we can get the first draft token after the first MTP module forward. For the sequential MTP modules, we can prepare their inputs in a similar way to the MTP modules in the context phase, so all of those MTP modules have the same input sequence length. After predicting all of the draft tokens, we need to evict the keys/values of those rejected draft tokens from the main model's KV cache to ensure the subsequent calculation is correct.
+
+### MTP Eagle
+
+<div align="center">
+<figure>
+  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_mtp_eagle.png" alt="tech_blog2_mtp_eagle" width="640" height="auto">
+</figure>
+</div>
+<p align="center"><sub><em>Figure 3. MTP Eagle, using the same notation as Figure 2</em></sub></p>
+
+MTP Eagle can be viewed as a variant of [Eagle](https://arxiv.org/pdf/2401.15077) speculative decoding method, but only supports chain decoding now. It reuses the same MTP module and repeats multiple times to predict draft tokens. MTP Eagle supports the model checkpoint with only one MTP module. The official DeepSeek-V3 and DeepSeek-R1 have only one MTP module in their checkpoints. Another difference with MTP vanilla is the KV cache. In the MTP Eagle method, the MTP module reuses the same KV cache when predicting multiple draft tokens.
+
+Figure 3 gives an MTP Eagle example. In the context phase, the inputs of the first MTP module forward are the same as the MTP Vanilla. However, for the sequential MTP module forward, the first difference is that MTP Eagle uses the same MTP module to predict draft tokens and reuses the same KV cache. Another difference is that we only need to input the token ID and the hidden state of one token. The token is the last predicted draft token, while the hidden state is the corresponding hidden state in the last MTP module forward. In this way, we can predict total K draft tokens by using only one MTP module.
+
+In the generation phase, the verification stage is the same as MTP Vanilla. After getting the accepted tokens, we will use the last accepted tokens and the corresponding hidden state as the inputs of the first MTP module forward. Compared with MTP Vanilla, it will be much easier to implement. And the sequential MTP module forwards use the same method as the context phase to prepare inputs. After predicting all of the draft tokens, we need to evict the keys/values of those rejected draft tokens from the main model's KV cache.
+
+## MTP implementation in TensorRT-LLM
+### Basic Implementation
+TensorRT-LLM has two different paths for MTP, one for [MTP Vanilla](https://github.com/NVIDIA/TensorRT-LLM/blob/338744fba6a91147b739b7f02d19b37bc19aa17a/tensorrt_llm/_torch/speculative/mtp.py#L326) and another for [MTP Eagle](https://github.com/NVIDIA/TensorRT-LLM/blob/338744fba6a91147b739b7f02d19b37bc19aa17a/tensorrt_llm/_torch/speculative/mtp.py#L1047). MTP Eagle is the default path for DeepSeek-V3 and DeepSeek-R1 models.
+
+<div align="center">
+<figure>
+  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_overall_workflow.png" alt="tech_blog2_overall_workflow" width="800" height="auto">
+</figure>
+</div>
+<p align="center"><sub><em>Figure 4. MTP workflow in TensorRT-LLM</em></sub></p>
+
+Figure 4 shows the overall workflow of MTP in TensorRT-LLM. Both paths share the runtime workflow, and the differences are in the MTP modules forward. In the context phase, there is no draft token in the inputs. TensorRT-LLM model engine fetches the input IDs from the requests and inputs to the model engine forward to get the next token and the hidden state. Then we prepare the MTP module inputs, and the MTP modules forward the inputs to predict the draft tokens.
+
+The generation workflow is more complicated. We need to do both the verification and draft stages. The predicted new token and draft tokens are the inputs for the main model. After the main model forward, we can sample from the output logits and get the following new tokens. Then compare them with the input draft tokens to get the final accepted tokens. The verification stage will be finished here. We will use the accepted tokens and hidden states to start a new draft stage, which uses the MTP layers to predict new draft tokens for the next iteration. Finally, we need to rewind the KV cache to evict keys/values corresponding to those rejected tokens.
+
+Except for the Rewind KV Cache, all of those processes are inside the model engine forward function. In this way, we can use one model engine to support MTP inference, and it would be easier for MTP to be compatible with other features, such as CUDA graph and overlap scheduler. When enabling CUDA graph, both the verification and draft stages can be captured in one graph, significantly reducing CPU overhead.
+
+### MTP Modules
+
+<div align="center">
+<figure>
+  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_mtp_modules.png" alt="tech_blog2_mtp_modules" width="640" height="auto">
+</figure>
+</div>
+<p align="center"><sub><em>Figure 5. MTP model architecture</em></sub></p>
+
+Figure 5 introduces the basic model architecture of [MTP Vanilla](https://github.com/NVIDIA/TensorRT-LLM/blob/338744fba6a91147b739b7f02d19b37bc19aa17a/tensorrt_llm/_torch/speculative/mtp.py#L326), [MTP Eagle](https://github.com/NVIDIA/TensorRT-LLM/blob/338744fba6a91147b739b7f02d19b37bc19aa17a/tensorrt_llm/_torch/speculative/mtp.py#L1047), and the basic [MTP module](https://github.com/NVIDIA/TensorRT-LLM/blob/338744fba6a91147b739b7f02d19b37bc19aa17a/tensorrt_llm/_torch/models/modeling_deepseekv3.py#L829) design. Because MTP vanilla needs $K$ input tokens, if the number of accepted tokens is less than the number of input tokens, i.e. $j<K$, we need to use the old token IDs and hidden states as the input of the first MTP module. To avoid bringing much additional computation overhead, we add two tensors for each request to save the past $K$ input IDs and the hidden states of past $K$ tokens, and update them by using the accepted tokens and corresponding hidden states each iteration. In this way, we can read these tensors when preparing inputs for the first MTP module. MTP Eagle implementation is much easier and straightforward, just call the same MTP module forward $K$ times to get $K$ new draft tokens.
+
+The MTP module follows the design in DeepSeek-V3. The embedding layer and output head in MTP modules are shared with the main model, which can save GPU memory consumption.
+
+
+### Attention for MTP
+
+Attention is also a very important component in supporting MTP inference. The changes are mainly in the attention kernels for the generation phase. For the normal request, there will be only one input token in the generation phase, but for MTP, there will be $K+1$ input tokens. Since MTP sequentially predicts additional tokens, the predicted draft tokens are chained. Though we have an MTP Eagle path, currently, we only have the chain-based support for MTP Eagle. So, a causal mask is enough for the attention kernel to support MTP. In our implementation, TensorRT-LLM will use the fp8 flashMLA generation kernel on Hopper GPU, while using TRTLLM customized attention kernels on Blackwell for better performance.
+
+### How to run DeepSeek models with MTP
+To run DeepSeek-V3/R1 models with MTP, use [examples/pytorch/quickstart_advanced.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/pytorch/quickstart_advanced.py) with additional options:
+
+```bash
+cd examples/pytorch
+python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_nextn N
+```
+
+To benchmark min-latency performance with MTP, you need to follow [this document](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/models/core/deepseek_v3/README.md#6-dataset-preparation) to prepare your dataset, then follow the steps below:
+
+```bash
+YOUR_DATA_PATH=<your dataset file following the format>
+
+cat >./extra-llm-api-config.yml<<EOF
+use_cuda_graph: true
+moe_backend: TRTLLM
+speculative_config:
+    decoding_type: MTP
+    num_nextn_predict_layers: 3
+EOF
+
+export TRTLLM_ENABLE_PDL=1
+
+trtllm-bench --model nvidia/DeepSeek-R1-FP4 \
+    throughput \
+    --dataset $YOUR_DATA_PATH \
+    --backend pytorch \
+    --num_requests 10 \
+    --concurrency 1 \
+    --max_batch_size 1 \
+    --tp 8 \
+    --ep 2 \
+    --extra_llm_api_options ./extra-llm-api-config.yml
+```
+
+## MTP optimization - Relaxed Acceptance
+DeepSeek-R1 is a reasoning model that first outputs some thinking tokens, after which the user can get the actual outputs. The thinking process usually takes up a lot of tokens, and the quality of the outputs of the thinking process may have a limited impact on the final answer. So we want to use a more aggressive acceptance strategy, called [relaxed acceptance](https://github.com/NVIDIA/TensorRT-LLM/pull/3865), for the thinking process to speed up the thinking decoding phase. This will be a tradeoff between speedup and output quality. From the experimental results, the impact of relaxed acceptance on output quality is limited.
+
+### Relaxed Acceptance
+
+<div align="center">
+<figure>
+  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_relaxed_acceptance.png" alt="tech_blog2_relaxed_acceptance" width="1024" height="auto">
+</figure>
+</div>
+<p align="center"><sub><em>Figure 6. Relaxed Acceptance example. Use MTP nextn=4 and top-3 in this example.</em></sub></p>
+
+In the previous verification and acceptance, we use top-1 sampling from the logits of the main model to get the “expected” tokens, as shown in Figure 1. There will be only one choice to compare with the draft tokens, which we call “Strict Acceptance”.
+
+As for the Relaxed Acceptance, we first get the top-N tokens sampled from the logits, so more candidates will be compared with the input draft tokens. To make sure the accepted tokens are as accurate as possible, we also added a probability threshold, i.e., delta. We can get the token probabilities by applying a softmax to the logits. After getting the top-N tokens, we will remove tokens from the candidate list if their probability is smaller than the (top-1 probability - delta). In this way, we may get more than one token candidate, and all of those tokens are with a high probability. Then we can compare the input draft tokens with those candidates. If one of them matches, we can accept this draft token, so the acceptance rate will be increased. Figure 6 shows an example of a comparison between Strict Acceptance and Relaxed Acceptance.
+
+Note that the Relaxed Acceptance will only be used during the thinking phase, while the Strict Acceptance will still be used during the non-thinking phase. And the Relaxed Acceptance only supports the DeepSeek-R1 model now.
+
+
+### How to run the DeepSeek-R1 model with Relaxed Acceptance
+
+To run DeepSeek-R1 models with MTP Relaxed Acceptance, use [examples/pytorch/quickstart_advanced.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/pytorch/quickstart_advanced.py) with additional options:
+
+```bash
+cd examples/pytorch
+python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_nextn N --use_relaxed_acceptance_for_thinking --relaxed_topk 10 --relaxed_delta 0.6
+```
+
+To benchmark min-latency performance with MTP Relaxed Acceptance, you need to follow [this document](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/models/core/deepseek_v3/README.md#6-dataset-preparation) to prepare your dataset, then follow the steps below:
+
+```bash
+YOUR_DATA_PATH=<your dataset file following the format>
+
+cat >./extra-llm-api-config.yml<<EOF
+use_cuda_graph: true
+moe_backend: TRTLLM
+speculative_config:
+    decoding_type: MTP
+    num_nextn_predict_layers: 3
+    use_relaxed_acceptance_for_thinking: true
+    relaxed_topk: 10
+    relaxed_delta: 0.6
+EOF
+
+export TRTLLM_ENABLE_PDL=1
+
+trtllm-bench --model nvidia/DeepSeek-R1-FP4 \
+    throughput \
+    --dataset $YOUR_DATA_PATH \
+    --backend pytorch \
+    --num_requests 10 \
+    --concurrency 1 \
+    --max_batch_size 1 \
+    --tp 8 \
+    --ep 2 \
+    --extra_llm_api_options ./extra-llm-api-config.yml
+```
+
+## Evaluation
+### Achieving speedup with MTP speculative decoding
+
+<div align="center">
+<figure>
+  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_perf_and_ar.png" alt="tech_blog2_perf_and_ar" width="1280" height="auto">
+</figure>
+</div>
+<p align="center"><sub><em>Figure 7. DeepSeek-R1-FP4 671B min-latency performance with different MTP next-n</em></sub></p>
+
+We tested the min-latency (batch size = 1) performance of the DeepSeek-R1-FP4 model with different MTP next-n on a B200 node. The MLA runs with TP=8, and the MoE runs with EP=2. And there are ten different requests with ISL/OSL=1K/2K. From Figure 7, we can see that MTP=3 can help get the best min-latency performance on 8 B200 GPUs, which can bring 2.16x speedup compared with the baseline nextn=0. And with the help of the relaxed acceptance, the min-latency performance can be further improved to achieve a 2.33x speedup. We also evaluated the CUDA graph and overlap scheduler benefits. For such a min-latency case, CUDA graph can achieve a 7.22x average speedup, while the overlap scheduler can achieve a 1.03x average speedup.
+
+### Accuracy studies for Relaxed Acceptance
+
+<div align="center">
+<figure>
+  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_acc_relaxed_acceptance.png" alt="tech_blog2_acc_relaxed_acceptance" width="800" height="auto">
+</figure>
+</div>
+<p align="center"><sub><em>Figure 8. Ablation results for the Relaxed Acceptance. Using MTP nextn=3, top-10, and delta=0.6.</em></sub></p>
+
+We validated the Relaxed Acceptance on different datasets. In Figure 8, we show the ablation results for Relaxed Acceptance by using the DeepSeek-R1-FP4 model. Compared with Strict Acceptance, the impact of Relaxed Acceptance on output quality is limited, resulting in only a slight accuracy drop.
+
+## Future Works
+### Tree-based speculative decoding support
+
+<div align="center">
+<figure>
+  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_tree_spec_decoding.png" alt="tech_blog2_tree_spec_decoding" width="800" height="auto">
+</figure>
+</div>
+<p align="center"><sub><em>Figure 9. Comparison between the chain-based and tree-based speculative decoding</em></sub></p>
+
+TensorRT-LLM PyTorch backend can only support chain-based speculative decoding now, both MTP Vanilla and MTP Eagle. However, the tree-based speculative decoding technique is widely used in previous advanced methods, such as Eagle2 and Eagle3, to increase the acceptance rate. MTPs in TensorRT-LLM can also be extended to support the tree-based technique. Figure 9 compares the chain-based method with the tree-based method. Both full tree and dynamic tree methods can help expand the candidate combinations, so that we can have more choices for the draft tokens.
+
+### Eagle3 support
+
+Another important method is Eagle3. From the [Eagle3 paper](https://arxiv.org/pdf/2503.01840), the promising results show that it can help greatly increase the acceptance rate by leveraging different levels’ hidden states to predict draft tokens. Since TensorRT-LLM already has [Eagle-3 support](https://github.com/NVIDIA/TensorRT-LLM/pull/3035) now, in the future, we also want to train an Eagle3 head to support DeepSeek-V3/R1+Eagle3 to achieve better speedup.
+
+### Fix known issues
+
+There are still some known issues, and we will fix them soon:
+- The MTP vanilla path has a known accuracy issue. We will fix it and refactor the MTP vanilla implementation.
+- The MTP Eagle is non-deterministic now.
+- An accuracy issue when enabling MTP and attention DP together.
+
+
+## Acknowledgment
+
+This was a remarkable cross-team effort to support and optimize MTP in TensorRT-LLM. We would like to extend our gratitude to everyone who contributed to making this possible, as it involved a typical system/algorithm co-design approach spanning multiple technical layers—including kernel optimization, runtime enhancements, algorithmic improvements, and performance measurement & analysis. And a special thanks goes to the DeepSeek team for developing the MTP method, which lays down the foundation of this blog.
diff --git a/latest/_sources/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md.txt b/latest/_sources/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md.txt
new file mode 100644
index 0000000000..15b418f9b5
--- /dev/null
+++ b/latest/_sources/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md.txt
@@ -0,0 +1,174 @@
+# Optimizing DeepSeek R1 Throughput on NVIDIA Blackwell GPUs: A Deep Dive for Developers
+
+By NVIDIA TensorRT-LLM team
+## Table of Contents
+  - [Introduction](#introduction)
+  - [Precision strategy](#precision-strategy)
+  - [Parallel strategy](#parallel-strategy)
+    - [Weights absorb and MQA](#weights-absorb-and-mqa)
+    - [Data Parallel for Attention module (ADP)](#data-parallel-for-attention-module-adp)
+    - [Expert parallel for MoE (EP)](#expert-parallel-for-moe-ep)
+  - [MLA Layers Optimizations](#mla-layers-optimizations)
+  - [MoE Layers Optimizations](#moe-layers-optimizations)
+  - [Runtime Optimizations](#runtime-optimizations)
+  - [How to reproduce](#how-to-reproduce)
+  - [Future Works](#future-works)
+  - [Acknowledgment](#acknowledgment)
+
+## Introduction
+The open source DeepSeek R1 model's innovative architecture including the multi-head latent attention (MLA) and large sparse Mixture-of-Experts (MoE) significantly improved the inference efficiency of the LLM models. However, harnessing the full potential of such an innovative structure requires equally important hardware/software co-optimization. This post delves into the optimization strategies for DeepSeek R1 throughput oriented scenarios (TPS/GPU), developed by NVIDIA within TensorRT-LLM on NVIDIA's Blackwell B200 GPUs. We will explore the rationale behind each enhancement. [The other min-latency optimization blog](./blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md) explained in detail how TensorRT-LLM optimizes the R1 performance to achieve the best of the TPS/USER.
+
+These optimizations have significantly boosted DeepSeek R1 throughput on Blackwell. Performance increased from approximately 2000 TPS/GPU in February to 4600 TPS/GPU on ISL/OSL 1K/2K dataset. The optimizations are general and applicable to other ISL/OSL configs too. These optimization items were broadly categorized into three areas: MLA layers, MoE layers, and runtime.
+
+## Precision strategy
+
+The mixed precision recipe for DeepSeek R1 throughput scenario is almost the same as [what](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md#precision-strategy) is used for latency oriented scenario, with the following differences:
+
+* FP8 KV cache and FP8 attention, rather than BF16 precision.
+* FP4 Allgather for better communication bandwidth utilization.
+
+The checkpoint used in this blog is hosted in [nvidia/DeepSeek-R1-FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), generated by [NVIDIA Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). The accuracy scores on common datasets for this FP4 checkpoint and the TensorRT-LLM implementations are:
+
+| Precision | GPQA Diamond | MATH-500 |
+| :-- | :-- | :-- |
+| TensorRT-LLM FP8 | 0.697	| 0.954 |
+| TensorRT-LLM FP4 | 0.705	| 0.96 |
+
+**Note:** there is some run-to-run variance in these evaluations, so the FP4 data is slightly higher here. We think FP4 has comparable accuracy with FP8 on these datasets.
+
+The MoE layers inside this checkpoint have been quantized into FP4. Quantizing the MoE layer weights into FP4 has the following benefits:
+
+* Fully utilize the 5th generation Tensor Core FLOPS of the NVIDIA Blackwell GPUs
+* Reduce the memory load needs of the weights by almost half for MoE. This matters because the MoE parts are still memory bound in the decoding phase for this scenario, and 97% of the weights in the DeepSeek R1 model are from MoE layers.
+* Reduce the memory footprint of the model weights, thus freeing more GPU memories for KV cache and then increasing the max concurrency. [The original FP8 model checkpoint of the DeepSeek R1 model](https://huggingface.co/deepseek-ai/DeepSeek-R1) is about 640GB, while the NVIDIA provided [DeepSeek R1 FP4 quantized model](https://huggingface.co/nvidia/DeepSeek-R1-FP4) is only about 400 GB.
+
+The accuracy of the FP8 KV cache and FP8 attention kernels is evaluated on the GSM8K dataset, with no obvious accuracy drop. For the accuracy numbers, please see the table in the FP8 KV cache section. Users can still opt out and use BF16 KV cache and attention if they observe accuracy differences on their own datasets.
+
+## Parallel strategy
+
+The parallelism strategy for DeepSeek R1 throughput scenario is different from what is used for latency-oriented scenarios.
+
+| Components | Parallel Patterns |
+| :---- | :---- |
+| Attention Modules | Data Parallelism 8 (DP8) |
+| MoE Sparse Experts | Expert Parallelism 8 (EP8) |
+| MoE Shared Experts | DP8 |
+| Fuse_A GEMM | DP8 |
+| Router GEMM | DP8 |
+
+In the following sections we will explain why DP and EP are chosen instead of tensor parallelism (TP).
+
+### Weights absorb and MQA
+
+The core idea of MLA is the low-rank joint compression for the attention keys and values to reduce KV-cache size during the inference. Based on the MLA formulas, the down-projected KV latent is up-projected to multiple heads and combined with the up-projected Q to establish a normal multi-head attention (MHA). Due to the nature of the matrix multiplication, the up-projection weights matrix of K (W^UK) can first be multiplied by the up-projection weights matrix of Q (W^Q), and the result of this product can then be multiplied with Q. The up-projection weights matrix of V (W^UV) and the attention output projection matrix W^O can also be multiplied after the attention output. The DeepSeek-V2 technical report calls this technique "absorb". After the weights are absorbed, the MLA is equivalent to multi-query attention (MQA). Please see the [original DeepSeek-V2 technical paper](https://arxiv.org/pdf/2405.04434) for the detailed formulas and explanations. The following block diagram shows the computational flow of weights absorbed MLA in TensorRT-LLM.
+![Weights Absorb](../media/tech_blog3_mla_absorb.png "Weights Absorbed MLA")
+
+For the decoding phase, weight absorption significantly reduces the math FLOPs needed to up-project K and V, since the FLOPs needed for these up-projections are linear in the KV cache length, while the length of the Q vector is always 1 in the decoding phase. The longer the KV cache history is, the more FLOPs are needed, and without absorption the up-projections are repeated for every decoded token since only the projected KV latent is saved, which further increases the FLOPs needed.
+For the prefill phase, the weights-absorbed version changes the dimensions of Q and KV, thus increasing the number of FLOPs for attention. Based on roofline analysis, the non-absorbed version is beneficial for the prefill phase with input length 256 or larger.
+The TensorRT-LLM MLA implementation chooses different highly optimized kernels for prefill and decoding, see [MLA](../../../../tensorrt_llm/_torch/modules/attention.py).
+
+### Data Parallel for Attention module (ADP)
+
+The intuition of choosing attention DP is that doing TP for the MQA (where different GPUs compute different attention Q heads) will duplicate the KV cache memory, which limits the concurrency being achieved by the system. The duplication factor is equal to the TP group size, thus 8x for TP8. Small concurrency will hurt the throughput for the powerful system like NVIDIA DGX B200.
+
+For the DeepSeek R1 FP4 checkpoint with 8 B200 GPUs, the weights and activations occupy about 80 GB of memory on each GPU, leaving about 100 GB free for KV cache per GPU. Assuming ISL 1K and OSL 2K, each request consumes about 200 MB of KV cache, which results in a per-GPU max concurrency of 500. A single-node 8-GPU system therefore has a global concurrency of 4000. When using attention TP, the global concurrency becomes just 500.
+
+Silicon experiments show the attention DP technique provides a significant **400% speedup** in the max throughput cases, when keeping all other factors the same.
+
+### Expert parallel for MoE (EP)
+
+The DeepSeek R1 MoE design features 256 small sparse experts and 1 shared expert. The GEMM problem sizes of these experts are as follows.
+
+| GEMM | group | GEMM N | GEMM K |
+| :---- | :---- | :---- | :---- |
+| shared_fc1 | 1 | 4096 | 7168 |
+| shared_fc2 | 1 | 7168 | 2048 |
+| sparse_fc1 | 256 | 4096 | 7168 |
+| sparse_fc2 | 256 | 7168 | 2048 |
+
+These experts can be executed with either Tensor-Parallelism or Expert-Parallelism. Our current ablation study reveals that Expert-Parallelism achieves better GEMM FLOPS because it has better GEMM problem sizes. Expert-Parallelism also saves GPU communication bandwidth compared to AllReduce, because the tokens only need to be sent to the GPUs where the active experts for those tokens are located, while TP needs an AllReduce for all the tokens between all the GPUs. Note also that large EP sizes are needed to scale DeepSeek R1 inference to systems like GB200 NVL72 and fully utilize the aggregated memory bandwidth and tensor core FLOPS. We are actively working on implementing this.
+
+Silicon performance measurements show that Expert-Parallelism can provide 142% speedup for 1K/2K max throughput case, when keeping other factors the same.
+
+## MLA Layers Optimizations
+
+Other than the parallel strategy and precision strategy we explained above, we have done the following optimizations for layers/kernels inside the MLA module.
+
+* Attention Kernels Optimization
+
+    This provided a **20% E2E speedup** compared to February baseline implementation. It involved implementing **high-throughput generation MLA kernels**. Techniques include using 2CTA Group variant of the Tensor Core 5th MMA instructions of Blackwell GPUs, overlapping MLA with softmax using interleaved tiles, and fine-tuning kernel selection heuristics for the DeepSeek R1 problem size.
+
+* FP8 KV Cache
+
+    An important optimization that yielded a **6% E2E throughput increase** when assuming the concurrency was identical. Another benefit of FP8 KV cache is **compressing the KV cache size by half**, which **allows for larger concurrency**. It also enables the use of faster FP8 attention kernels compared to BF16. We recommend that users always turn on FP8 KV cache to get better performance. In the context phase, KV is quantized to FP8 and saved to the KV cache pool. In the generation phase, both Q and KV are quantized to FP8, and FP8 Multi-Query Attention (MQA) is used. Evaluation on GSM8k showed **no meaningful accuracy drop**. The quantization typically uses static per-tensor FP8 with a scaling factor defaulting to 1.0, but KV cache scaling factor can also be generated by calibrating on a target dataset. Below are the accuracy metrics of different combinations on the GSM8K dataset.
+
+    | KV Cache Type | FP8 Checkpoint | FP4 Checkpoint |
+    | :---- | :---- | :---- |
+    | BF16 MLA and KV cache | 0.9629 | 0.9606 |
+    | FP8 MLA and KV cache | 0.9613 | 0.9606 |
+
+
+* Manual GEMM tactics tuning
+
+    This optimization addresses cases where the default heuristic algorithm in cuBLAS is not performing best for specific GEMM shapes existing in the model. We built an internal tool to find the best algorithm for these specific shapes offline and then used the `cublasLtMatmul` API to apply this specific, optimized algorithm at runtime. This is a necessary system optimization when general-purpose heuristics don't find the most efficient kernel for all specific cases. We are also working actively with the cuBLAS team to further enhance the heuristics such that the best performance can always be achieved OOTB. See [cublasScaledMM.cpp](https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/thop/cublasScaledMM.cpp#L54) for the tuning details.
+
+* Horizontal Fusions
+
+    This involves fusing GEMM operations of down projection of Q/KV and rope dimensions of K tensor. See [modeling_deepseekv3.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/models/modeling_deepseekv3.py#L1305) for details. Horizontal fusion reduces the kernel launch overhead and increases the GEMM problem sizes which can achieve better HW utilization. It is a common technique shared by both min-latency and throughput optimizations.
+
+* 2-stream optimizations
+
+    There are some small operations which can be run in parallel like the Q norm and KV norm inside the MLA. These operations cannot fully utilize the GPU math flops and the memory bandwidth, thus running in parallel CUDA streams can bring speed-up.
+
+## MoE Layers Optimizations
+
+The following optimizations are already done for MoE layers.
+
+* Mix I/O data type for the router GEMM
+
+    Achieved a **4% E2E speedup** by avoiding casting operations and performing the GEMM using a mixture of input and output data types (e.g., BF16 input and FP32 output) directly. This eliminates the need to explicitly cast inputs to the output type and saves memory bandwidth.
+
+* Top-K Kernels Fusions
+
+    Resulted in a **7.4% E2E speedup**. For DeepSeek R1, selecting the top 8 experts from 256 is done in a two-phase approach: first selecting top groups, then finding the top 8 within those groups. DeepSeek R1 uses some additional techniques for better expert load balance which involve adding bias and scales to the top-K computations. All these operations resulted in 18 PyTorch ops when not fused, see [Deepseekv3RoutingImpl](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/models/modeling_deepseekv3.py#L213). Fusing the multiple kernels involved in these Top-K calculations significantly reduces the overall computation time. Compared to using 18 native PyTorch ops, fusion can reduce the operation to as few as 2 kernels. Based on the measurement on B200, fusing these kernels can reduce the kernel time from 252us to 15us in the target setting.
+
+* FP4 AllGather Optimizations
+
+    Showed a **4% E2E speedup**. This optimization replaces the BF16 AllGather operation with an FP4 version. Using a lower precision for this communication primitive reduces the amount of data transferred over the network, significantly improving communication efficiency. Also, since the original BF16 Tensor to be transferred will get cast into FP4 format after the AllGather communication, this optimization will not bring any impact to the accuracy. At the kernel level, we are seeing about a 3x speedup when switching from BF16 to FP4 AllGather.
+
+* CUTLASS Group GEMM optimizations
+
+    Provided a **1.3% E2E speedup**. There are some CUTLASS level optimizations shared by both min-latency and throughput cases. Just updating CUTLASS to the latest version gives us 13% kernel improvement for the MoE groupGemm, and resulted in +1.3% E2E TPS/GPU.
+
+* Multi-stream optimizations
+    Running the shared and routed experts in 2 streams, combined with other multi-streaming optimizations in the MLA modules, contributes a **5.3% E2E speedup**.
+
+## Runtime Optimizations
+
+These optimizations target the overall execution flow, scheduling, and resource management within the inference system. They are shared between DeepSeek R1 models and other models supported in TensorRT-LLM; here we share some ablation studies of their performance benefits for DeepSeek R1 on B200.
+
+* CUDA Graph
+
+    This had a significant **22% E2E performance impact** for throughput scenarios. CUDA Graphs allow capturing a sequence of CUDA operations and launching them as a single unit, drastically reducing kernel launch overheads. This is particularly beneficial for models with many small kernels, and particularly on the PyTorch flow, because the Python host code normally executes slower than C++. Since the CUDA Graph freezes the kernel launch parameters, which are normally associated with the tensor shapes, it can only be safely used with static shapes, meaning that different CUDA graphs need to be captured for different batch sizes. Each graph has some cost in memory usage and capturing time, thus we cannot capture every possible CUDA graph for all possible batches. For the non-captured batch sizes, PyTorch eager mode code will be executed. There is a feature called CUDA Graph padding in TensorRT-LLM, which is a good trade-off between the number of CUDA Graphs and the CUDA Graph hit ratio; it tries to pad a batch to the nearest one with a captured CUDA Graph. Normally you should enable the CUDA Graph padding feature to increase the CUDA Graph hit rate, but the padding itself has some overhead due to wasted tokens computation. Users can opt out of the CUDA Graph padding feature to see the perf benefits, by setting the `cuda_graph_padding_enabled` to false, see API here [Pytorch backend config](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/config.py#L41).
+
+* Overlap Scheduler:
+
+    Showed a **4% E2E performance impact** and should generally **always be used**. This scheduler manages the execution of different operations (like computation and communication) to overlap them effectively on the GPU and network. The intuition is to hide latency by performing computation while waiting for data transfers or vice versa, improving overall hardware utilization. The overlap scheduler is already enabled by default in TensorRT-LLM by [commit](https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428#diff-3c4f29d6594b37af0f1fbb97f5291b18e49f3f2510f9d296c7adb2829e9da0bf). In case there are corner cases where it does not work, users can still opt out of this feature by setting *disable_overlap_scheduler* to true.
+
+* Memory Optimizations
+
+    Resulted in a **4GB improvement**. This includes techniques like chunked MoE (specifically for Hopper) and fixing a cuda context init bug. These methods reduce the memory footprint of the model weights or intermediate tensors, allowing for larger batch sizes or sequence lengths, and preventing Out-of-Memory (OOM) errors.
+
+## How to reproduce
+
+See [Perf practices](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md#b200-max-throughput)
+
+## Future Works
+
+- Large EP
+- Chunked context
+- More communication overlap
+
+## Acknowledgment
+
+The substantial throughput advancements for DeepSeek R1 on Blackwell GPUs, as detailed in this post, are the fruit of a dedicated and collaborative engineering effort. Achieving nearly a 2.3x increase in TPS/GPU required a deep dive into MLA layers, MoE layers, and runtime optimizations. We extend our sincere appreciation to all the engineers involved in this intensive optimization process. Their collective expertise in pushing the boundaries of throughput performance within TensorRT-LLM has been instrumental. We trust that sharing these specific strategies for maximizing throughput will prove beneficial to the developer community as they tackle demanding LLM inference workloads on NVIDIA hardware.
diff --git a/latest/_sources/examples/index.rst.txt b/latest/_sources/examples/index.rst.txt
index bb1e96f52d..bc863a197d 100644
--- a/latest/_sources/examples/index.rst.txt
+++ b/latest/_sources/examples/index.rst.txt
@@ -20,6 +20,7 @@ The LLM API can be used for both offline or online usage. See more examples of t
     llm_inference_async
     llm_inference_distributed
     llm_logits_processor
+    llm_eagle2_decoding
     llm_inference_kv_events
     llm_lookahead_decoding
     llm_quantization
diff --git a/latest/_sources/examples/llm_api_examples.rst.txt b/latest/_sources/examples/llm_api_examples.rst.txt
index d515fce884..e6d788e739 100644
--- a/latest/_sources/examples/llm_api_examples.rst.txt
+++ b/latest/_sources/examples/llm_api_examples.rst.txt
@@ -11,6 +11,7 @@ LLM Examples
    llm_inference_async
    llm_inference_distributed
    llm_logits_processor
+   llm_eagle2_decoding
    llm_inference_kv_events
    llm_lookahead_decoding
    llm_quantization
diff --git a/latest/_sources/examples/llm_eagle2_decoding.rst.txt b/latest/_sources/examples/llm_eagle2_decoding.rst.txt
new file mode 100644
index 0000000000..54e5d91079
--- /dev/null
+++ b/latest/_sources/examples/llm_eagle2_decoding.rst.txt
@@ -0,0 +1,8 @@
+Generate Text Using Eagle2 Decoding
+===================================
+
+Source https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api/llm_eagle2_decoding.py.
+
+.. literalinclude:: ../../../examples/llm-api/llm_eagle2_decoding.py
+    :language: python
+    :linenos:
diff --git a/latest/_sources/index.rst.txt b/latest/_sources/index.rst.txt
index 06c8b3f26d..b95dca8d0d 100644
--- a/latest/_sources/index.rst.txt
+++ b/latest/_sources/index.rst.txt
@@ -104,6 +104,7 @@ Welcome to TensorRT-LLM's Documentation!
    advanced/inference-request.md
    advanced/lora.md
    advanced/expert-parallelism.md
+   advanced/kv-cache-management.md
    advanced/kv-cache-reuse.md
    advanced/speculative-decoding.md
    advanced/disaggregated-service.md
@@ -144,6 +145,7 @@ Welcome to TensorRT-LLM's Documentation!
    blogs/quantization-in-TRT-LLM.md
    blogs/XQA-kernel.md
    blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md
+   blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md
 
 
 Indices and tables
diff --git a/latest/_sources/llm-api/reference.rst.txt b/latest/_sources/llm-api/reference.rst.txt
index 8115c5e669..b2147540d6 100644
--- a/latest/_sources/llm-api/reference.rst.txt
+++ b/latest/_sources/llm-api/reference.rst.txt
@@ -131,3 +131,23 @@ API Reference
     :undoc-members:
     :special-members: __init__
     :show-inheritance:
+.. autoclass:: tensorrt_llm.llmapi.NGramDecodingConfig
+    :members:
+    :undoc-members:
+    :special-members: __init__
+    :show-inheritance:
+.. autoclass:: tensorrt_llm.llmapi.LlmArgs
+    :members:
+    :undoc-members:
+    :special-members: __init__
+    :show-inheritance:
+.. autoclass:: tensorrt_llm.llmapi.TorchLlmArgs
+    :members:
+    :undoc-members:
+    :special-members: __init__
+    :show-inheritance:
+.. autoclass:: tensorrt_llm.llmapi.TrtLlmArgs
+    :members:
+    :undoc-members:
+    :special-members: __init__
+    :show-inheritance:
diff --git a/latest/_sources/performance/perf-benchmarking.md.txt b/latest/_sources/performance/perf-benchmarking.md.txt
index 8a5daf4ec1..990eefb477 100644
--- a/latest/_sources/performance/perf-benchmarking.md.txt
+++ b/latest/_sources/performance/perf-benchmarking.md.txt
@@ -628,8 +628,7 @@ If you would like to force the KV cache quantizaton, you can specify the followi
 when the checkpoint precision is `null`:
 
 ```yaml
-pytorch_backend_config:
-  kv_cache_dtype: "fp8"
+kv_cache_dtype: "fp8"
 ```
 
 ```{tip}
diff --git a/latest/_sources/performance/perf-overview.md.txt b/latest/_sources/performance/perf-overview.md.txt
index ae751112d9..2cf4204d2f 100644
--- a/latest/_sources/performance/perf-overview.md.txt
+++ b/latest/_sources/performance/perf-overview.md.txt
@@ -200,12 +200,9 @@ trtllm-bench --model $model_name throughput --dataset $dataset_file --backend py
 
 `llm_options.yml`
 ```yaml
-
- pytorch_backend_config:
-  enable_overlap_scheduler: true
-  use_cuda_graph: true
-  cuda_graph_padding_enabled: true
-  cuda_graph_batch_sizes:
+use_cuda_graph: true
+cuda_graph_padding_enabled: true
+cuda_graph_batch_sizes:
   - 1
   - 2
   - 4
diff --git a/latest/_sources/torch/attention.md.txt b/latest/_sources/torch/attention.md.txt
index a462465041..2cde32ae90 100644
--- a/latest/_sources/torch/attention.md.txt
+++ b/latest/_sources/torch/attention.md.txt
@@ -16,7 +16,7 @@ The following sections explain how to use these implementations and provide a br
 
 
 There are currently three available attention backends: the vanilla backend, the TRT-LLM backend, and the Flashinfer backend.
-You can specify the desired attention backend using `PyTorchConfig.attn_backend`. For instance, to utilize the Flashinfer backend, you can create a `PyTorchConfig` with `attn_backend = "flashinfer"` and then pass it to the `LLM` constructor as follows: `LLM(pytorch_backend_config=pytorch_config)`. This will enable the use of the Flashinfer backend for your model.
+You can specify the desired attention backend using `PyTorchConfig.attn_backend`. For instance, to utilize the Flashinfer backend, you can pass `attn_backend="flashinfer"` to the `LLM` constructor as follows: `LLM(attn_backend="flashinfer")`. This will enable the use of the Flashinfer backend for your model.
 
 The vanilla backend, `VanillaAttention`, is a reference implementation designed primarily for inflight batching and linear KV cache support. While it serves as a useful baseline, it is not recommended for production use due to its limited optimizations.
 
diff --git a/latest/_sources/torch/kv_cache_manager.md.txt b/latest/_sources/torch/kv_cache_manager.md.txt
index b9d07f1a11..cbe2e3e40f 100644
--- a/latest/_sources/torch/kv_cache_manager.md.txt
+++ b/latest/_sources/torch/kv_cache_manager.md.txt
@@ -4,6 +4,8 @@ In Transformer-based models, the KV (Key-Value) Cache is a mechanism used to opt
 Since KV Cache requires memory to store, it is also an important resource.
 In TensorRT-LLM, KV Cache is managed by the `KVCacheManager`.
 
+For details of the TensorRT-LLM `KVCacheManager` implementation see [KV Cache Management](../advanced/kv-cache-management.md).
+
 ## KV Cache Manager Introduction
 
 `KVCacheManager` is a type of resource manager, inheriting from `BaseResourceManager`.
diff --git a/latest/advanced/disaggregated-service.html b/latest/advanced/disaggregated-service.html
index 131f0d673f..34b99a4b6e 100644
--- a/latest/advanced/disaggregated-service.html
+++ b/latest/advanced/disaggregated-service.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1 current active"><a class="current reference internal" href="#">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -801,6 +805,15 @@ export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/advanced/executor.html b/latest/advanced/executor.html
index 4331a9f678..7a10cd2e85 100644
--- a/latest/advanced/executor.html
+++ b/latest/advanced/executor.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -806,6 +810,15 @@ the TensorRT-LLM C++ Executor API.</p>
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/advanced/expert-parallelism.html b/latest/advanced/expert-parallelism.html
index 44d1070e36..fbd8066e74 100644
--- a/latest/advanced/expert-parallelism.html
+++ b/latest/advanced/expert-parallelism.html
@@ -51,19 +51,19 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
     <link rel="icon" href="../_static/favicon.png"/>
     <link rel="index" title="Index" href="../genindex.html" />
     <link rel="search" title="Search" href="../search.html" />
-    <link rel="next" title="KV cache reuse" href="kv-cache-reuse.html" />
+    <link rel="next" title="KV Cache Management: Pools, Blocks, and Events" href="kv-cache-management.html" />
     <link rel="prev" title="Run gpt-2b + LoRA using Executor / cpp runtime" href="lora.html" />
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1 current active"><a class="current reference internal" href="#">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -550,11 +554,11 @@
       </div>
     </a>
     <a class="right-next"
-       href="kv-cache-reuse.html"
+       href="kv-cache-management.html"
        title="next page">
       <div class="prev-next-info">
         <p class="prev-next-subtitle">next</p>
-        <p class="prev-next-title">KV cache reuse</p>
+        <p class="prev-next-title">KV Cache Management: Pools, Blocks, and Events</p>
       </div>
       <i class="fa-solid fa-angle-right"></i>
     </a>
@@ -673,6 +677,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/advanced/gpt-attention.html b/latest/advanced/gpt-attention.html
index e302883ab9..ad3ec90def 100644
--- a/latest/advanced/gpt-attention.html
+++ b/latest/advanced/gpt-attention.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -991,6 +995,15 @@ is computed as:</p>
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/advanced/gpt-runtime.html b/latest/advanced/gpt-runtime.html
index 093133c62d..b2d8b3a4b2 100644
--- a/latest/advanced/gpt-runtime.html
+++ b/latest/advanced/gpt-runtime.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1031,6 +1035,15 @@ The <code class="docutils literal notranslate"><span class="pre">GptDecoder</spa
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/advanced/graph-rewriting.html b/latest/advanced/graph-rewriting.html
index 6cc3b894ad..2586085da9 100644
--- a/latest/advanced/graph-rewriting.html
+++ b/latest/advanced/graph-rewriting.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1 current active"><a class="current reference internal" href="#">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -852,6 +856,15 @@ techniques to optimize the underlying graph.  It provides a wrapper similar to P
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/advanced/kv-cache-management.html b/latest/advanced/kv-cache-management.html
new file mode 100644
index 0000000000..5f5065d0db
--- /dev/null
+++ b/latest/advanced/kv-cache-management.html
@@ -0,0 +1,781 @@
+
+
+<!DOCTYPE html>
+
+
+<html lang="en" data-content_root="../" >
+
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
+
+    <title>KV Cache Management: Pools, Blocks, and Events &#8212; TensorRT-LLM</title>
+  
+  
+  
+  <script data-cfasync="false">
+    document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
+    document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
+  </script>
+  <!--
+    this give us a css class that will be invisible only if js is disabled
+  -->
+  <noscript>
+    <style>
+      .pst-js-only { display: none !important; }
+
+    </style>
+  </noscript>
+  
+  <!-- Loaded before other Sphinx assets -->
+  <link href="../_static/styles/theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
+<link href="../_static/styles/pydata-sphinx-theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
+
+    <link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=8f2a1f02" />
+    <link rel="stylesheet" type="text/css" href="../_static/styles/nvidia-sphinx-theme.css?v=df3ac72c" />
+    <link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
+    <link rel="stylesheet" type="text/css" href="../_static/autodoc_pydantic.css" />
+  
+  <!-- So that users can add custom icons -->
+  <script src="../_static/scripts/fontawesome.js?digest=8878045cc6db502f8baf"></script>
+  <!-- Pre-loaded scripts that we'll load fully later -->
+  <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf" />
+<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf" />
+
+    <script src="../_static/documentation_options.js?v=5929fcd5"></script>
+    <script src="../_static/doctools.js?v=9a2dae69"></script>
+    <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
+    <script src="../_static/clipboard.min.js?v=a7894cd8"></script>
+    <script src="../_static/copybutton.js?v=65e89d2a"></script>
+    <script>DOCUMENTATION_OPTIONS.pagename = 'advanced/kv-cache-management';</script>
+    <script>
+        DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
+        DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
+        DOCUMENTATION_OPTIONS.show_version_warning_banner =
+            false;
+        </script>
+    <link rel="icon" href="../_static/favicon.png"/>
+    <link rel="index" title="Index" href="../genindex.html" />
+    <link rel="search" title="Search" href="../search.html" />
+    <link rel="next" title="KV cache reuse" href="kv-cache-reuse.html" />
+    <link rel="prev" title="Expert Parallelism in TensorRT-LLM" href="expert-parallelism.html" />
+
+  <meta name="viewport" content="width=device-width, initial-scale=1"/>
+  <meta name="docsearch:language" content="en"/>
+  <meta name="docsearch:version" content="0.21.0rc0" />
+
+
+  </head>
+  
+  
+  <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
+
+  
+  
+  <div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
+  
+  <div id="pst-scroll-pixel-helper"></div>
+  
+  <button type="button" class="btn rounded-pill" id="pst-back-to-top">
+    <i class="fa-solid fa-arrow-up"></i>Back to top</button>
+
+  
+  <dialog id="pst-search-dialog">
+    
+<form class="bd-search d-flex align-items-center"
+      action="../search.html"
+      method="get">
+  <i class="fa-solid fa-magnifying-glass"></i>
+  <input type="search"
+         class="form-control"
+         name="q"
+         placeholder="Search the docs ..."
+         aria-label="Search the docs ..."
+         autocomplete="off"
+         autocorrect="off"
+         autocapitalize="off"
+         spellcheck="false"/>
+  <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
+</form>
+  </dialog>
+
+  <div class="pst-async-banner-revealer d-none">
+  <aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
+</div>
+
+  
+    <header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
+<div class="bd-header__inner bd-page-width">
+  <button class="pst-navbar-icon sidebar-toggle primary-toggle" aria-label="Site navigation">
+    <span class="fa-solid fa-bars"></span>
+  </button>
+  
+  
+  <div class="col-lg-3 navbar-header-items__start">
+    
+      <div class="navbar-item">
+
+  
+    
+  
+
+<a class="navbar-brand logo" href="../index.html">
+  
+  
+  
+  
+  
+    
+    
+      
+    
+    
+    <img src="../_static/nvidia-logo-horiz-rgb-blk-for-screen.svg" class="logo__image only-light" alt="TensorRT-LLM - Home"/>
+    <img src="../_static/nvidia-logo-horiz-rgb-wht-for-screen.svg" class="logo__image only-dark pst-js-only" alt="TensorRT-LLM - Home"/>
+  
+  
+    <p class="title logo__title">TensorRT-LLM</p>
+  
+</a></div>
+    
+  </div>
+  
+  <div class="col-lg-9 navbar-header-items">
+    
+    <div class="me-auto navbar-header-items__center">
+      
+        <div class="navbar-item">
+
+
+<div class="version-switcher__container dropdown pst-js-only">
+  <button id="pst-version-switcher-button-2"
+    type="button"
+    class="version-switcher__button btn btn-sm dropdown-toggle"
+    data-bs-toggle="dropdown"
+    aria-haspopup="listbox"
+    aria-controls="pst-version-switcher-list-2"
+    aria-label="Version switcher list"
+  >
+    Choose version  <!-- this text may get changed later by javascript -->
+    <span class="caret"></span>
+  </button>
+  <div id="pst-version-switcher-list-2"
+    class="version-switcher__menu dropdown-menu list-group-flush py-0"
+    role="listbox" aria-labelledby="pst-version-switcher-button-2">
+    <!-- dropdown will be populated by javascript on page load -->
+  </div>
+</div></div>
+      
+    </div>
+    
+    
+    <div class="navbar-header-items__end">
+      
+        <div class="navbar-item navbar-persistent--container">
+          
+
+<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
+ <i class="fa-solid fa-magnifying-glass"></i>
+ <span class="search-button__default-text">Search</span>
+ <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
+</button>
+        </div>
+      
+      
+        <div class="navbar-item">
+
+<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode"  data-bs-placement="bottom" data-bs-toggle="tooltip">
+  <i class="theme-switch fa-solid fa-sun                fa-lg" data-mode="light" title="Light"></i>
+  <i class="theme-switch fa-solid fa-moon               fa-lg" data-mode="dark"  title="Dark"></i>
+  <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"  title="System Settings"></i>
+</button></div>
+      
+    </div>
+    
+  </div>
+  
+  
+    <div class="navbar-persistent--mobile">
+
+<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
+ <i class="fa-solid fa-magnifying-glass"></i>
+ <span class="search-button__default-text">Search</span>
+ <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
+</button>
+    </div>
+  
+
+  
+    <button class="pst-navbar-icon sidebar-toggle secondary-toggle" aria-label="On this page">
+      <span class="fa-solid fa-outdent"></span>
+    </button>
+  
+</div>
+
+    </header>
+  
+
+  <div class="bd-container">
+    <div class="bd-container__inner bd-page-width">
+      
+      
+      
+      <dialog id="pst-primary-sidebar-modal"></dialog>
+      <div id="pst-primary-sidebar" class="bd-sidebar-primary bd-sidebar">
+        
+
+
+
+  
+    
+  
+
+<a class="navbar-brand logo" href="../index.html">
+  
+  
+  
+  
+  
+    
+    
+      
+    
+    
+    <img src="../_static/nvidia-logo-horiz-rgb-blk-for-screen.svg" class="logo__image only-light" alt="TensorRT-LLM - Home"/>
+    <img src="../_static/nvidia-logo-horiz-rgb-wht-for-screen.svg" class="logo__image only-dark pst-js-only" alt="TensorRT-LLM - Home"/>
+  
+  
+    <p class="title logo__title">TensorRT-LLM</p>
+  
+</a>
+
+
+  
+  <div class="sidebar-header-items sidebar-primary__section">
+    
+    
+      <div class="sidebar-header-items__center">
+        
+          
+          
+            <div class="navbar-item">
+
+
+<div class="version-switcher__container dropdown pst-js-only">
+  <button id="pst-version-switcher-button-3"
+    type="button"
+    class="version-switcher__button btn btn-sm dropdown-toggle"
+    data-bs-toggle="dropdown"
+    aria-haspopup="listbox"
+    aria-controls="pst-version-switcher-list-3"
+    aria-label="Version switcher list"
+  >
+    Choose version  <!-- this text may get changed later by javascript -->
+    <span class="caret"></span>
+  </button>
+  <div id="pst-version-switcher-list-3"
+    class="version-switcher__menu dropdown-menu list-group-flush py-0"
+    role="listbox" aria-labelledby="pst-version-switcher-button-3">
+    <!-- dropdown will be populated by javascript on page load -->
+  </div>
+</div></div>
+          
+        
+      </div>
+    
+    
+    
+      <div class="sidebar-header-items__end">
+        
+          <div class="navbar-item">
+
+<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode"  data-bs-placement="bottom" data-bs-toggle="tooltip">
+  <i class="theme-switch fa-solid fa-sun                fa-lg" data-mode="light" title="Light"></i>
+  <i class="theme-switch fa-solid fa-moon               fa-lg" data-mode="dark"  title="Dark"></i>
+  <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"  title="System Settings"></i>
+</button></div>
+        
+      </div>
+    
+  </div>
+  
+    <div class="sidebar-primary-items__start sidebar-primary__section">
+        <div class="sidebar-primary-item">
+
+
+
+<nav class="bd-docs-nav bd-links"
+     aria-label="Table of Contents">
+  <p class="bd-links__title" role="heading" aria-level="1">Table of Contents</p>
+  <div class="bd-toc-item navbar-nav"><p aria-level="2" class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../overview.html">Overview</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../quick-start-guide.html">Quick Start Guide</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../key-features.html">Key Features</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../torch.html">PyTorch Backend</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../release-notes.html">Release Notes</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Installation</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../installation/linux.html">Installing on Linux</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-linux.html">Building from Source Code on Linux</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../installation/grace-hopper.html">Installing on Grace Hopper</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">LLM API</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../llm-api/index.html">API Introduction</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../llm-api/reference.html">API Reference</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1 has-children"><a class="reference internal" href="../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference.html">Generate text</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_customize.html">Generate text with customization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_serve.html">Llm Mgmn Trtllm Serve</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1"><a class="reference internal" href="../examples/customization.html">LLM Common Customizations</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference.html">Generate text</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_customize.html">Generate text with customization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_serve.html">Llm Mgmn Trtllm Serve</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../examples/trtllm_serve_examples.html">Online Serving Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../examples/curl_chat_client.html">Curl Chat Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/curl_chat_client_for_multimodal.html">Curl Chat Client For Multimodal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/curl_completion_client.html">Curl Completion Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/genai_perf_client.html">Genai Perf Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client For Multimodal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
+</ul>
+</details></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Model Definition API</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.layers.html">Layers</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.functional.html">Functionals</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.models.html">Models</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.plugin.html">Plugin</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html">Quantization</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.runtime.html">Runtime</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">C++ API</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/executor.html">Executor</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/runtime.html">Runtime</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-build.html">trtllm-build</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-serve.html">trtllm-serve</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../architecture/overview.html">TensorRT-LLM Architecture</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html">Model Definition</a></li>
+
+
+
+<li class="toctree-l1"><a class="reference internal" href="../architecture/checkpoint.html">TensorRT-LLM Checkpoint</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../architecture/workflow.html">TensorRT-LLM Build Workflow</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../architecture/add-model.html">Adding a Model</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Advanced</span></p>
+<ul class="current nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
+<li class="toctree-l1"><a class="reference internal" href="gpt-runtime.html">C++ GPT Runtime</a></li>
+<li class="toctree-l1"><a class="reference internal" href="executor.html">Executor API</a></li>
+<li class="toctree-l1"><a class="reference internal" href="graph-rewriting.html">Graph Rewriting Module</a></li>
+<li class="toctree-l1"><a class="reference internal" href="lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
+<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1 current active"><a class="current reference internal" href="#">KV Cache Management: Pools, Blocks, and Events</a></li>
+<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
+<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
+<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../performance/perf-overview.html">Overview</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../performance/perf-benchmarking.html">Benchmarking</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../performance/performance-tuning-guide/index.html">Performance Tuning Guide</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/benchmarking-default-performance.html">Benchmarking Default Performance</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/useful-build-time-flags.html">Useful Build-Time Flags</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.html">Tuning Max Batch Size and Max Num Tokens</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/deciding-model-sharding-strategy.html">Deciding Model Sharding Strategy</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/fp8-quantization.html">FP8 Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/useful-runtime-flags.html">Useful Runtime Options</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1"><a class="reference internal" href="../performance/perf-analysis.html">Performance Analysis</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Reference</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../reference/troubleshooting.html">Troubleshooting</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../reference/support-matrix.html">Support Matrix</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../reference/precision.html">Numerical Precision</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../reference/memory.html">Memory Usage of TensorRT-LLM</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Blogs</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../blogs/H100vsA100.html">H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/H200launch.html">H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
+</ul>
+</div>
+</nav></div>
+    </div>
+  
+  
+  <div class="sidebar-primary-items__end sidebar-primary__section">
+  </div>
+
+
+
+      </div>
+      
+      <main id="main-content" class="bd-main" role="main">
+        
+        
+          <div class="bd-content">
+            <div class="bd-article-container">
+              
+              <div class="bd-header-article d-print-none">
+<div class="header-article-items header-article__inner">
+  
+    <div class="header-article-items__start">
+      
+        <div class="header-article-item">
+
+<nav aria-label="Breadcrumb" class="d-print-none">
+  <ul class="bd-breadcrumbs">
+    
+    <li class="breadcrumb-item breadcrumb-home">
+      <a href="../index.html" class="nav-link" aria-label="Home">
+        <i class="fa-solid fa-home"></i>
+      </a>
+    </li>
+    <li class="breadcrumb-item active" aria-current="page"><span class="ellipsis">KV Cache Management: Pools, Blocks, and Events</span></li>
+  </ul>
+</nav>
+</div>
+      
+    </div>
+  
+  
+</div>
+</div>
+              
+              
+              
+                
+<div id="searchbox"></div>
+                <article class="bd-article">
+                  
+  <section id="kv-cache-management-pools-blocks-and-events">
+<span id="kv-cache-management"></span><h1>KV Cache Management: Pools, Blocks, and Events<a class="headerlink" href="#kv-cache-management-pools-blocks-and-events" title="Link to this heading">#</a></h1>
+<p>This document provides an overview of the internal hierarchy and event system for paged KV cache management, as implemented in the TensorRT-LLM codebase.</p>
+<p>For more information on KV cache reuse see <a class="reference internal" href="kv-cache-reuse.html"><span class="std std-doc">KV cache reuse</span></a>.</p>
+<hr class="docutils" />
+<section id="hierarchy-pool-block-and-page">
+<h2>Hierarchy: Pool, Block, and Page<a class="headerlink" href="#hierarchy-pool-block-and-page" title="Link to this heading">#</a></h2>
+<section id="block">
+<h3><strong>Block</strong><a class="headerlink" href="#block" title="Link to this heading">#</a></h3>
+<ul class="simple">
+<li><p><strong>Definition:</strong> The smallest unit of KV cache allocation. A <code class="docutils literal notranslate"><span class="pre">KVCacheBlock</span></code> holds metadata (not the actual data) for a chunk of KV cache.</p></li>
+<li><p><strong>Purpose:</strong> Each block represents a fixed number of tokens’ worth of KV data (configurable via the <code class="docutils literal notranslate"><span class="pre">tokens_per_block</span></code> parameter).</p></li>
+<li><p><strong>Usage:</strong> Blocks are allocated, reused, or evicted as sequences are processed.</p></li>
+</ul>
+</section>
+<section id="page">
+<h3><strong>Page</strong><a class="headerlink" href="#page" title="Link to this heading">#</a></h3>
+<ul class="simple">
+<li><p><strong>Definition:</strong> In this codebase, “page” is often used interchangeably with “block” (as in “paged KV cache”), but technically, a page could refer to a memory page (hardware-level), while a block is a logical unit for the cache.</p></li>
+<li><p><strong>In Practice:</strong> The code uses “block” as the main unit; “page” is not a distinct class or struct.</p></li>
+</ul>
+</section>
+<section id="pool">
+<h3><strong>Pool</strong><a class="headerlink" href="#pool" title="Link to this heading">#</a></h3>
+<ul class="simple">
+<li><p><strong>Definition:</strong> A pool is a contiguous memory buffer (or set of buffers) that holds the actual KV data for one or more layers.</p></li>
+<li><p><strong>Types:</strong> There are primary pools (fast GPU memory) and secondary pools (slower, e.g., CPU or offload memory).</p></li>
+<li><p><strong>Organization:</strong> Each pool can serve multiple layers that share the same KV head configuration. Pools are managed by <code class="docutils literal notranslate"><span class="pre">KVCacheBlockPool</span></code> and tracked in vectors in <code class="docutils literal notranslate"><span class="pre">WindowBlockManager</span></code>.</p></li>
+<li><p><strong>Block ↔ Pool:</strong> Each block is an index into a pool; the pool provides the actual storage, while the block is the metadata handle.</p></li>
+</ul>
+</section>
+<section id="windowblockmanager-blockmanager">
+<h3><strong>WindowBlockManager/BlockManager</strong><a class="headerlink" href="#windowblockmanager-blockmanager" title="Link to this heading">#</a></h3>
+<p>TRT-LLM supports 2 complex features related to KV cache management:</p>
+<ol class="arabic simple">
+<li><p><strong>Variable Group-Query Attention (VGQA)</strong> - i.e. a different <code class="docutils literal notranslate"><span class="pre">num_kv_heads</span></code> value for different layers.</p></li>
+<li><p><strong>Variable Sliding Window Attention (VSWA)</strong> - i.e. a different <code class="docutils literal notranslate"><span class="pre">attention_window_size</span></code> value for different layers.</p></li>
+</ol>
+<p>In order to support both of these features, the pool management works as described below.</p>
+<p>But in the simple, <em>most common case</em>, for most models, where</p>
+<ol class="arabic simple">
+<li><p><a class="reference internal" href="gpt-attention.html#multi-head-multi-query-and-group-query-attention"><span class="std std-ref">MHA/MQA/Non-variable GQA</span></a>, i.e., same <code class="docutils literal notranslate"><span class="pre">num_kv_heads</span></code> value for all layers,</p></li>
+<li><p>Global attention/<a class="reference internal" href="gpt-attention.html#sliding-window-attention-cyclic-rolling-buffer-kv-cache"><span class="std std-ref">SWA</span></a>, i.e., same <code class="docutils literal notranslate"><span class="pre">attention_window_size</span></code> value for all layers,</p></li>
+</ol>
+<p>only a <em>single</em> pool will be created within the structure described below.</p>
+<section id="kv-cache-pool-management">
+<h4>KV Cache Pool Management<a class="headerlink" href="#kv-cache-pool-management" title="Link to this heading">#</a></h4>
+<ul class="simple">
+<li><p><strong>WindowBlockManager:</strong> Manages blocks and pools for a specific attention window size. Within a <code class="docutils literal notranslate"><span class="pre">WindowBlockManager</span></code>, there can be multiple pools - each corresponding to a unique number of KV heads - i.e., to support VGQA.</p></li>
+<li><p><strong>BlockManager:</strong> Manages all <code class="docutils literal notranslate"><span class="pre">WindowBlockManager</span></code> instances, one per unique window size.</p></li>
+</ul>
+<p><strong>Hierarchy Summary:</strong></p>
+<ul class="simple">
+<li><p><strong>Pool</strong> (memory buffer for KV data)</p>
+<ul>
+<li><p>Contains many blocks.</p></li>
+</ul>
+</li>
+<li><p><strong>Blocks</strong> (metadata for a chunk of the pool, each block = <code class="docutils literal notranslate"><span class="pre">tokens_per_block</span></code> tokens)</p>
+<ul>
+<li><p>(Optionally, blocks can be swapped between primary/secondary pools.)</p></li>
+</ul>
+</li>
+<li><p><strong>BlockManager/WindowBlockManager</strong>: Manage pools and blocks, handle allocation, reuse, and eviction.</p></li>
+</ul>
+</section>
+</section>
+</section>
+<hr class="docutils" />
+<section id="events-in-kvcacheeventmanager">
+<h2>Events in <code class="docutils literal notranslate"><span class="pre">KVCacheEventManager</span></code><a class="headerlink" href="#events-in-kvcacheeventmanager" title="Link to this heading">#</a></h2>
+<p>The <code class="docutils literal notranslate"><span class="pre">KVCacheEventManager</span></code> is responsible for tracking and reporting significant changes in the state of the KV cache. Events are used for logging, debugging, or possibly for external monitoring.</p>
+<section id="types-of-events">
+<h3><strong>Types of Events</strong><a class="headerlink" href="#types-of-events" title="Link to this heading">#</a></h3>
+<ul class="simple">
+<li><p><strong>Created Event:</strong> When pools or blocks are created/allocated.</p></li>
+<li><p><strong>Updated Event:</strong> When a block’s state changes (e.g., moved between primary/secondary, priority updated).</p></li>
+<li><p><strong>Removed Event:</strong> When a block is removed from the cache (evicted or released).</p></li>
+<li><p><strong>Stored Event:</strong> When blocks are stored for potential reuse (e.g., after a sequence finishes and its blocks are reusable).</p></li>
+</ul>
+</section>
+<section id="what-triggers-an-event">
+<h3><strong>What Triggers an Event?</strong><a class="headerlink" href="#what-triggers-an-event" title="Link to this heading">#</a></h3>
+<ul class="simple">
+<li><p><strong>Allocation/Deallocation:</strong> Creating or freeing memory pools or blocks.</p></li>
+<li><p><strong>Eviction/Reuse:</strong> When a block is evicted, reused, or its priority changes.</p></li>
+<li><p><strong>Block Movement:</strong> When a block is moved between memory levels (primary ↔ secondary).</p></li>
+<li><p><strong>Block Storage:</strong> When blocks are stored for future reuse (e.g., after a sequence completes).</p></li>
+</ul>
+<p><strong>In summary:</strong>
+An “event” is any significant change in the lifecycle or state of a KV cache block or pool, tracked for monitoring, debugging, or optimization purposes.</p>
+<hr class="docutils" />
+</section>
+</section>
+</section>
+
+
+                </article>
+              
+              
+              
+              
+              
+                <footer class="prev-next-footer d-print-none">
+                  
+<div class="prev-next-area">
+    <a class="left-prev"
+       href="expert-parallelism.html"
+       title="previous page">
+      <i class="fa-solid fa-angle-left"></i>
+      <div class="prev-next-info">
+        <p class="prev-next-subtitle">previous</p>
+        <p class="prev-next-title">Expert Parallelism in TensorRT-LLM</p>
+      </div>
+    </a>
+    <a class="right-next"
+       href="kv-cache-reuse.html"
+       title="next page">
+      <div class="prev-next-info">
+        <p class="prev-next-subtitle">next</p>
+        <p class="prev-next-title">KV cache reuse</p>
+      </div>
+      <i class="fa-solid fa-angle-right"></i>
+    </a>
+</div>
+                </footer>
+              
+            </div>
+            
+            
+
+
+              
+                <dialog id="pst-secondary-sidebar-modal"></dialog>
+                <div id="pst-secondary-sidebar" class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
+
+
+  <div class="sidebar-secondary-item">
+<div
+    id="pst-page-navigation-heading-2"
+    class="page-toc tocsection onthispage">
+    <i class="fa-solid fa-list"></i> On this page
+  </div>
+  <nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2">
+    <ul class="visible nav section-nav flex-column">
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#hierarchy-pool-block-and-page">Hierarchy: Pool, Block, and Page</a><ul class="nav section-nav flex-column">
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#block"><strong>Block</strong></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#page"><strong>Page</strong></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#pool"><strong>Pool</strong></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#windowblockmanager-blockmanager"><strong>WindowBlockManager/BlockManager</strong></a><ul class="nav section-nav flex-column">
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#kv-cache-pool-management">KV Cache Pool Management</a></li>
+</ul>
+</li>
+</ul>
+</li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#events-in-kvcacheeventmanager">Events in <code class="docutils literal notranslate"><span class="pre">KVCacheEventManager</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#types-of-events"><strong>Types of Events</strong></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#what-triggers-an-event"><strong>What Triggers an Event?</strong></a></li>
+</ul>
+</li>
+</ul>
+  </nav></div>
+
+</div></div>
+              
+            
+
+          </div>
+          <footer class="bd-footer-content">
+            
+          </footer>
+        
+      </main>
+    </div>
+  </div>
+  
+  <!-- Scripts loaded after <body> so the DOM is not blocked -->
+  <script defer src="../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf"></script>
+<script defer src="../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf"></script>
+
+  <footer class="bd-footer">
+<div class="bd-footer__inner bd-page-width">
+  
+    <div class="footer-items__start">
+      
+        <div class="footer-item">
+<a class="footer-brand logo" href="https://www.nvidia.com">
+  <img src="../_static/nvidia-logo-horiz-rgb-1c-blk-for-screen.svg" class="logo__image only-light" alt="NVIDIA"/>
+  <img src="../_static/nvidia-logo-horiz-rgb-1c-wht-for-screen.svg" class="logo__image only-dark" alt="NVIDIA"/>
+</a></div>
+      
+        <div class="footer-item">
+
+<div class="footer-links">
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/">Privacy Policy</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/">Manage My Privacy</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/preferences/start/">Do Not Sell or Share My Data</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/">Terms of Service</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/">Accessibility</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/">Corporate Policies</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/product-security/">Product Security</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/contact/">Contact</a>
+  
+  
+  
+</div>
+</div>
+      
+        <div class="footer-item">
+
+
+
+
+  <p class="copyright">
+    
+      Copyright © 2025, NVIDIA.
+      <br/>
+    
+  </p>
+</div>
+      
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
+    </div>
+  
+  
+  
+</div>
+
+  </footer>
+  </body>
+</html>
\ No newline at end of file
diff --git a/latest/advanced/kv-cache-reuse.html b/latest/advanced/kv-cache-reuse.html
index a2f802dbdc..619e464383 100644
--- a/latest/advanced/kv-cache-reuse.html
+++ b/latest/advanced/kv-cache-reuse.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -59,11 +59,11 @@
     <link rel="index" title="Index" href="../genindex.html" />
     <link rel="search" title="Search" href="../search.html" />
     <link rel="next" title="Speculative Sampling" href="speculative-decoding.html" />
-    <link rel="prev" title="Expert Parallelism in TensorRT-LLM" href="expert-parallelism.html" />
+    <link rel="prev" title="KV Cache Management: Pools, Blocks, and Events" href="kv-cache-management.html" />
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1 current active"><a class="current reference internal" href="#">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -594,12 +598,12 @@ Assume vocabulary size is 100, which means normal text token ids are in range [0
                   
 <div class="prev-next-area">
     <a class="left-prev"
-       href="expert-parallelism.html"
+       href="kv-cache-management.html"
        title="previous page">
       <i class="fa-solid fa-angle-left"></i>
       <div class="prev-next-info">
         <p class="prev-next-subtitle">previous</p>
-        <p class="prev-next-title">Expert Parallelism in TensorRT-LLM</p>
+        <p class="prev-next-title">KV Cache Management: Pools, Blocks, and Events</p>
       </div>
     </a>
     <a class="right-next"
@@ -730,6 +734,15 @@ Assume vocabulary size is 100, which means normal text token ids are in range [0
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/advanced/lora.html b/latest/advanced/lora.html
index 974773b158..81f4f71e6e 100644
--- a/latest/advanced/lora.html
+++ b/latest/advanced/lora.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1 current active"><a class="current reference internal" href="#">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -844,6 +848,15 @@ The shape of <code class="docutils literal notranslate"><span class="pre">LoraWe
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/advanced/lowprecision-pcie-allreduce.html b/latest/advanced/lowprecision-pcie-allreduce.html
new file mode 100644
index 0000000000..f3b84da6f4
--- /dev/null
+++ b/latest/advanced/lowprecision-pcie-allreduce.html
@@ -0,0 +1,725 @@
+
+
+<!DOCTYPE html>
+
+
+<html lang="en" data-content_root="../" >
+
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
+
+    <title>Low-Precision-AllReduce &#8212; TensorRT-LLM</title>
+  
+  
+  
+  <script data-cfasync="false">
+    document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
+    document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
+  </script>
+  <!--
+    This gives us a CSS class that will be invisible only if JS is disabled.
+  -->
+  <noscript>
+    <style>
+      .pst-js-only { display: none !important; }
+
+    </style>
+  </noscript>
+  
+  <!-- Loaded before other Sphinx assets -->
+  <link href="../_static/styles/theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
+<link href="../_static/styles/pydata-sphinx-theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
+
+    <link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=8f2a1f02" />
+    <link rel="stylesheet" type="text/css" href="../_static/styles/nvidia-sphinx-theme.css?v=df3ac72c" />
+    <link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
+    <link rel="stylesheet" type="text/css" href="../_static/autodoc_pydantic.css" />
+  
+  <!-- So that users can add custom icons -->
+  <script src="../_static/scripts/fontawesome.js?digest=8878045cc6db502f8baf"></script>
+  <!-- Pre-loaded scripts that we'll load fully later -->
+  <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf" />
+<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf" />
+
+    <script src="../_static/documentation_options.js?v=5929fcd5"></script>
+    <script src="../_static/doctools.js?v=9a2dae69"></script>
+    <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
+    <script src="../_static/clipboard.min.js?v=a7894cd8"></script>
+    <script src="../_static/copybutton.js?v=65e89d2a"></script>
+    <script>DOCUMENTATION_OPTIONS.pagename = 'advanced/lowprecision-pcie-allreduce';</script>
+    <script>
+        DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
+        DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
+        DOCUMENTATION_OPTIONS.show_version_warning_banner =
+            false;
+        </script>
+    <link rel="icon" href="../_static/favicon.png"/>
+    <link rel="index" title="Index" href="../genindex.html" />
+    <link rel="search" title="Search" href="../search.html" />
+
+  <meta name="viewport" content="width=device-width, initial-scale=1"/>
+  <meta name="docsearch:language" content="en"/>
+  <meta name="docsearch:version" content="0.21.0rc0" />
+
+
+  </head>
+  
+  
+  <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
+
+  
+  
+  <div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
+  
+  <div id="pst-scroll-pixel-helper"></div>
+  
+  <button type="button" class="btn rounded-pill" id="pst-back-to-top">
+    <i class="fa-solid fa-arrow-up"></i>Back to top</button>
+
+  
+  <dialog id="pst-search-dialog">
+    
+<form class="bd-search d-flex align-items-center"
+      action="../search.html"
+      method="get">
+  <i class="fa-solid fa-magnifying-glass"></i>
+  <input type="search"
+         class="form-control"
+         name="q"
+         placeholder="Search the docs ..."
+         aria-label="Search the docs ..."
+         autocomplete="off"
+         autocorrect="off"
+         autocapitalize="off"
+         spellcheck="false"/>
+  <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
+</form>
+  </dialog>
+
+  <div class="pst-async-banner-revealer d-none">
+  <aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
+</div>
+
+  
+    <header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
+<div class="bd-header__inner bd-page-width">
+  <button class="pst-navbar-icon sidebar-toggle primary-toggle" aria-label="Site navigation">
+    <span class="fa-solid fa-bars"></span>
+  </button>
+  
+  
+  <div class="col-lg-3 navbar-header-items__start">
+    
+      <div class="navbar-item">
+
+  
+    
+  
+
+<a class="navbar-brand logo" href="../index.html">
+  
+  
+  
+  
+  
+    
+    
+      
+    
+    
+    <img src="../_static/nvidia-logo-horiz-rgb-blk-for-screen.svg" class="logo__image only-light" alt="TensorRT-LLM - Home"/>
+    <img src="../_static/nvidia-logo-horiz-rgb-wht-for-screen.svg" class="logo__image only-dark pst-js-only" alt="TensorRT-LLM - Home"/>
+  
+  
+    <p class="title logo__title">TensorRT-LLM</p>
+  
+</a></div>
+    
+  </div>
+  
+  <div class="col-lg-9 navbar-header-items">
+    
+    <div class="me-auto navbar-header-items__center">
+      
+        <div class="navbar-item">
+
+
+<div class="version-switcher__container dropdown pst-js-only">
+  <button id="pst-version-switcher-button-2"
+    type="button"
+    class="version-switcher__button btn btn-sm dropdown-toggle"
+    data-bs-toggle="dropdown"
+    aria-haspopup="listbox"
+    aria-controls="pst-version-switcher-list-2"
+    aria-label="Version switcher list"
+  >
+    Choose version  <!-- this text may get changed later by javascript -->
+    <span class="caret"></span>
+  </button>
+  <div id="pst-version-switcher-list-2"
+    class="version-switcher__menu dropdown-menu list-group-flush py-0"
+    role="listbox" aria-labelledby="pst-version-switcher-button-2">
+    <!-- dropdown will be populated by javascript on page load -->
+  </div>
+</div></div>
+      
+    </div>
+    
+    
+    <div class="navbar-header-items__end">
+      
+        <div class="navbar-item navbar-persistent--container">
+          
+
+<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
+ <i class="fa-solid fa-magnifying-glass"></i>
+ <span class="search-button__default-text">Search</span>
+ <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
+</button>
+        </div>
+      
+      
+        <div class="navbar-item">
+
+<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode"  data-bs-placement="bottom" data-bs-toggle="tooltip">
+  <i class="theme-switch fa-solid fa-sun                fa-lg" data-mode="light" title="Light"></i>
+  <i class="theme-switch fa-solid fa-moon               fa-lg" data-mode="dark"  title="Dark"></i>
+  <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"  title="System Settings"></i>
+</button></div>
+      
+    </div>
+    
+  </div>
+  
+  
+    <div class="navbar-persistent--mobile">
+
+<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
+ <i class="fa-solid fa-magnifying-glass"></i>
+ <span class="search-button__default-text">Search</span>
+ <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
+</button>
+    </div>
+  
+
+  
+    <button class="pst-navbar-icon sidebar-toggle secondary-toggle" aria-label="On this page">
+      <span class="fa-solid fa-outdent"></span>
+    </button>
+  
+</div>
+
+    </header>
+  
+
+  <div class="bd-container">
+    <div class="bd-container__inner bd-page-width">
+      
+      
+      
+      <dialog id="pst-primary-sidebar-modal"></dialog>
+      <div id="pst-primary-sidebar" class="bd-sidebar-primary bd-sidebar">
+        
+
+
+
+  
+    
+  
+
+<a class="navbar-brand logo" href="../index.html">
+  
+  
+  
+  
+  
+    
+    
+      
+    
+    
+    <img src="../_static/nvidia-logo-horiz-rgb-blk-for-screen.svg" class="logo__image only-light" alt="TensorRT-LLM - Home"/>
+    <img src="../_static/nvidia-logo-horiz-rgb-wht-for-screen.svg" class="logo__image only-dark pst-js-only" alt="TensorRT-LLM - Home"/>
+  
+  
+    <p class="title logo__title">TensorRT-LLM</p>
+  
+</a>
+
+
+  
+  <div class="sidebar-header-items sidebar-primary__section">
+    
+    
+      <div class="sidebar-header-items__center">
+        
+          
+          
+            <div class="navbar-item">
+
+
+<div class="version-switcher__container dropdown pst-js-only">
+  <button id="pst-version-switcher-button-3"
+    type="button"
+    class="version-switcher__button btn btn-sm dropdown-toggle"
+    data-bs-toggle="dropdown"
+    aria-haspopup="listbox"
+    aria-controls="pst-version-switcher-list-3"
+    aria-label="Version switcher list"
+  >
+    Choose version  <!-- this text may get changed later by javascript -->
+    <span class="caret"></span>
+  </button>
+  <div id="pst-version-switcher-list-3"
+    class="version-switcher__menu dropdown-menu list-group-flush py-0"
+    role="listbox" aria-labelledby="pst-version-switcher-button-3">
+    <!-- dropdown will be populated by javascript on page load -->
+  </div>
+</div></div>
+          
+        
+      </div>
+    
+    
+    
+      <div class="sidebar-header-items__end">
+        
+          <div class="navbar-item">
+
+<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode"  data-bs-placement="bottom" data-bs-toggle="tooltip">
+  <i class="theme-switch fa-solid fa-sun                fa-lg" data-mode="light" title="Light"></i>
+  <i class="theme-switch fa-solid fa-moon               fa-lg" data-mode="dark"  title="Dark"></i>
+  <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"  title="System Settings"></i>
+</button></div>
+        
+      </div>
+    
+  </div>
+  
+    <div class="sidebar-primary-items__start sidebar-primary__section">
+        <div class="sidebar-primary-item">
+
+
+
+<nav class="bd-docs-nav bd-links"
+     aria-label="Table of Contents">
+  <p class="bd-links__title" role="heading" aria-level="1">Table of Contents</p>
+  <div class="bd-toc-item navbar-nav"><p aria-level="2" class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../overview.html">Overview</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../quick-start-guide.html">Quick Start Guide</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../key-features.html">Key Features</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../torch.html">PyTorch Backend</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../release-notes.html">Release Notes</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Installation</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../installation/linux.html">Installing on Linux</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-linux.html">Building from Source Code on Linux</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../installation/grace-hopper.html">Installing on Grace Hopper</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">LLM API</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../llm-api/index.html">API Introduction</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../llm-api/reference.html">API Reference</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1 has-children"><a class="reference internal" href="../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference.html">Generate text</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_customize.html">Generate text with customization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_serve.html">Llm Mgmn Trtllm Serve</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1"><a class="reference internal" href="../examples/customization.html">LLM Common Customizations</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference.html">Generate text</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_customize.html">Generate text with customization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_serve.html">Llm Mgmn Trtllm Serve</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../examples/trtllm_serve_examples.html">Online Serving Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../examples/curl_chat_client.html">Curl Chat Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/curl_chat_client_for_multimodal.html">Curl Chat Client For Multimodal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/curl_completion_client.html">Curl Completion Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/genai_perf_client.html">Genai Perf Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
+</ul>
+</details></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Model Definition API</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.layers.html">Layers</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.functional.html">Functionals</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.models.html">Models</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.plugin.html">Plugin</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html">Quantization</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.runtime.html">Runtime</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">C++ API</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/executor.html">Executor</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/runtime.html">Runtime</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-build.html">trtllm-build</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-serve.html">trtllm-serve</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../architecture/overview.html">TensorRT-LLM Architecture</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html">Model Definition</a></li>
+
+
+
+<li class="toctree-l1"><a class="reference internal" href="../architecture/checkpoint.html">TensorRT-LLM Checkpoint</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../architecture/workflow.html">TensorRT-LLM Build Workflow</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../architecture/add-model.html">Adding a Model</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Advanced</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
+<li class="toctree-l1"><a class="reference internal" href="gpt-runtime.html">C++ GPT Runtime</a></li>
+<li class="toctree-l1"><a class="reference internal" href="executor.html">Executor API</a></li>
+<li class="toctree-l1"><a class="reference internal" href="graph-rewriting.html">Graph Rewriting Module</a></li>
+<li class="toctree-l1"><a class="reference internal" href="lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
+<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
+<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
+<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
+<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../performance/perf-overview.html">Overview</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../performance/perf-benchmarking.html">Benchmarking</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../performance/performance-tuning-guide/index.html">Performance Tuning Guide</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/benchmarking-default-performance.html">Benchmarking Default Performance</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/useful-build-time-flags.html">Useful Build-Time Flags</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.html">Tuning Max Batch Size and Max Num Tokens</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/deciding-model-sharding-strategy.html">Deciding Model Sharding Strategy</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/fp8-quantization.html">FP8 Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/useful-runtime-flags.html">Useful Runtime Options</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1"><a class="reference internal" href="../performance/perf-analysis.html">Performance Analysis</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Reference</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../reference/troubleshooting.html">Troubleshooting</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../reference/support-matrix.html">Support Matrix</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../reference/precision.html">Numerical Precision</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../reference/memory.html">Memory Usage of TensorRT-LLM</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Blogs</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../blogs/H100vsA100.html">H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/H200launch.html">H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
+</ul>
+</div>
+</nav></div>
+    </div>
+  
+  
+  <div class="sidebar-primary-items__end sidebar-primary__section">
+  </div>
+
+
+
+      </div>
+      
+      <main id="main-content" class="bd-main" role="main">
+        
+        
+          <div class="bd-content">
+            <div class="bd-article-container">
+              
+              <div class="bd-header-article d-print-none">
+<div class="header-article-items header-article__inner">
+  
+    <div class="header-article-items__start">
+      
+        <div class="header-article-item">
+
+<nav aria-label="Breadcrumb" class="d-print-none">
+  <ul class="bd-breadcrumbs">
+    
+    <li class="breadcrumb-item breadcrumb-home">
+      <a href="../index.html" class="nav-link" aria-label="Home">
+        <i class="fa-solid fa-home"></i>
+      </a>
+    </li>
+    <li class="breadcrumb-item active" aria-current="page"><span class="ellipsis">Low-Precision-AllReduce</span></li>
+  </ul>
+</nav>
+</div>
+      
+    </div>
+  
+  
+</div>
+</div>
+              
+              
+              
+                
+<div id="searchbox"></div>
+                <article class="bd-article">
+                  
+  <section id="low-precision-allreduce">
+<h1>Low-Precision-AllReduce<a class="headerlink" href="#low-precision-allreduce" title="Link to this heading">#</a></h1>
+<div class="admonition note">
+<p class="admonition-title">Note</p>
+<p>This feature is optimized for PCIe-based GPU topologies and may affect model accuracy.
+Please evaluate the precision impact for your specific workload.</p>
+</div>
+<p>TRT-LLM supports <code class="docutils literal notranslate"><span class="pre">low-precision-allreduce</span></code>, a communication optimization that accelerates AllReduce operations in PCIe-based GPU environments. This feature quantizes FP16/BF16 data to FP8 during network transmission, reducing communication volume and improving performance.</p>
+<section id="algorithm">
+<h2>Algorithm<a class="headerlink" href="#algorithm" title="Link to this heading">#</a></h2>
+<p>The Low-Precision-AllReduce algorithm works by:</p>
+<ol class="arabic">
+<li><p>Quantizing input FP16/BF16 tensors to FP8 format before network transmission</p>
+<p><strong>Quantization details</strong>: We use a “per-warp” quantization approach where each CUDA warp (32 threads) processes a batch of data. In each warp, 31 threads quantize FP16/BF16 values to FP8 e4m3 format (16 bytes per thread), while the last thread transmits a scalar value. This results in each warp collectively quantizing 496 elements plus one scalar at a time.</p>
+</li>
+<li><p>Transmitting the quantized data through the network</p></li>
+<li><p>Dequantizing received data back to the original precision</p></li>
+<li><p>Performing the reduction operation</p></li>
+</ol>
+<p>In 8-GPU scenarios, this approach shifts the communication bottleneck from cross-NUMA QPI to the PCIe switch, resulting in better overall performance.</p>
+</section>
+<section id="topology-requirements">
+<h2>Topology Requirements<a class="headerlink" href="#topology-requirements" title="Link to this heading">#</a></h2>
+<p><img alt="8x L20/L40s Node Architecture" src="../_images/8x_l20_L40S_node_architecture.png" /></p>
+<p>Low-Precision-AllReduce is specifically designed for the topology shown above, where:</p>
+<ul class="simple">
+<li><p>Each node contains 2 NUMA domains</p></li>
+<li><p>Each NUMA domain has 4 GPUs connected via PCIe switch</p></li>
+<li><p>GPUs within the same NUMA node communicate via the PCIe switch</p></li>
+</ul>
+<p><strong>Important:</strong> This optimization does not improve performance on other topologies (e.g., where each GPU is in a separate NUMA domain).</p>
+</section>
+<section id="usage">
+<h2>Usage<a class="headerlink" href="#usage" title="Link to this heading">#</a></h2>
+<p>The Low-Precision-AllReduce algorithm can be enabled in two ways:</p>
+<ol class="arabic simple">
+<li><p><strong>Direct specification</strong> in your code:</p></li>
+</ol>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">AllReduce</span> <span class="n">allreduce</span><span class="p">(</span><span class="n">mapping</span><span class="o">=</span><span class="n">mapping</span><span class="p">,</span> <span class="n">strategy</span><span class="o">=</span><span class="n">AllReduceStrategy</span><span class="o">.</span><span class="n">LOWPRECISION</span><span class="p">);</span>
+</pre></div>
+</div>
+<ol class="arabic simple" start="2">
+<li><p><strong>Environment variable control</strong> with AUTO strategy:</p></li>
+</ol>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="o">//</span> <span class="n">In</span> <span class="n">your</span> <span class="n">code</span>
+<span class="n">AllReduce</span> <span class="n">allreduce</span><span class="p">(</span><span class="n">mapping</span><span class="o">=</span><span class="n">mapping</span><span class="p">,</span> <span class="n">strategy</span><span class="o">=</span><span class="n">AllReduceStrategy</span><span class="o">.</span><span class="n">AUTO</span><span class="p">);</span>
+<span class="o">//</span> <span class="n">Set</span> <span class="n">environment</span> <span class="n">variable</span> <span class="n">before</span> <span class="n">running</span>
+<span class="n">export</span> <span class="n">FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY</span><span class="o">=</span><span class="mi">1</span>
+</pre></div>
+</div>
+</section>
+<section id="performance-and-accuracy-considerations">
+<h2>Performance and Accuracy Considerations<a class="headerlink" href="#performance-and-accuracy-considerations" title="Link to this heading">#</a></h2>
+<p>Low-Precision-AllReduce reduces communication volume by using FP8 data format for transmission. This optimization:</p>
+<ul class="simple">
+<li><p>Improves performance for large message sizes in PCIe-based topologies</p></li>
+<li><p>May slightly reduce numerical precision</p></li>
+<li><p>Automatically falls back to other strategies when no performance benefit is expected (e.g., with NVLink or small messages)</p></li>
+</ul>
+<p>Users should evaluate the precision impact on their specific models and workloads.</p>
+</section>
+<section id="environment-variables">
+<h2>Environment Variables<a class="headerlink" href="#environment-variables" title="Link to this heading">#</a></h2>
+<ul class="simple">
+<li><p><code class="docutils literal notranslate"><span class="pre">FORCE_LOW_PRECISION_ALL_REDUCE_STRATEGY</span></code>: When set to <code class="docutils literal notranslate"><span class="pre">1</span></code>, forces the use of the low-precision algorithm when the AUTO strategy is selected. If the algorithm determines that it cannot provide a performance benefit, it automatically falls back to other strategies.</p></li>
+</ul>
+<p><strong>Note</strong>: If TensorRT-LLM is compiled without the <code class="docutils literal notranslate"><span class="pre">ENABLE_FP8</span></code> option, enabling Low-Precision-AllReduce has no effect.</p>
+</section>
+</section>
+
+
+                </article>
+              
+              
+              
+              
+              
+                <footer class="prev-next-footer d-print-none">
+                  
+<div class="prev-next-area">
+</div>
+                </footer>
+              
+            </div>
+            
+            
+
+
+              
+                <dialog id="pst-secondary-sidebar-modal"></dialog>
+                <div id="pst-secondary-sidebar" class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
+
+
+  <div class="sidebar-secondary-item">
+<div
+    id="pst-page-navigation-heading-2"
+    class="page-toc tocsection onthispage">
+    <i class="fa-solid fa-list"></i> On this page
+  </div>
+  <nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2">
+    <ul class="visible nav section-nav flex-column">
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#algorithm">Algorithm</a></li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#topology-requirements">Topology Requirements</a></li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#usage">Usage</a></li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#performance-and-accuracy-considerations">Performance and Accuracy Considerations</a></li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#environment-variables">Environment Variables</a></li>
+</ul>
+  </nav></div>
+
+</div></div>
+              
+            
+
+          </div>
+          <footer class="bd-footer-content">
+            
+          </footer>
+        
+      </main>
+    </div>
+  </div>
+  
+  <!-- Scripts loaded after <body> so the DOM is not blocked -->
+  <script defer src="../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf"></script>
+<script defer src="../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf"></script>
+
+  <footer class="bd-footer">
+<div class="bd-footer__inner bd-page-width">
+  
+    <div class="footer-items__start">
+      
+        <div class="footer-item">
+<a class="footer-brand logo" href="https://www.nvidia.com">
+  <img src="../_static/nvidia-logo-horiz-rgb-1c-blk-for-screen.svg" class="logo__image only-light" alt="NVIDIA"/>
+  <img src="../_static/nvidia-logo-horiz-rgb-1c-wht-for-screen.svg" class="logo__image only-dark" alt="NVIDIA"/>
+</a></div>
+      
+        <div class="footer-item">
+
+<div class="footer-links">
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/">Privacy Policy</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/">Manage My Privacy</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/preferences/start/">Do Not Sell or Share My Data</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/">Terms of Service</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/">Accessibility</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/">Corporate Policies</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/product-security/">Product Security</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/contact/">Contact</a>
+  
+  
+  
+</div>
+</div>
+      
+        <div class="footer-item">
+
+
+
+
+  <p class="copyright">
+    
+      Copyright © 2025, NVidia.
+      <br/>
+    
+  </p>
+</div>
+      
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
+    </div>
+  
+  
+  
+</div>
+
+  </footer>
+  </body>
+</html>
\ No newline at end of file
diff --git a/latest/advanced/speculative-decoding.html b/latest/advanced/speculative-decoding.html
index 74b962fcce..06fcd229ad 100644
--- a/latest/advanced/speculative-decoding.html
+++ b/latest/advanced/speculative-decoding.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1 current active"><a class="current reference internal" href="#">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -832,6 +836,15 @@ However, similar to any new model, you can follow the same approach to define yo
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/advanced/weight-streaming.html b/latest/advanced/weight-streaming.html
index 2517abf465..a6c1605c91 100644
--- a/latest/advanced/weight-streaming.html
+++ b/latest/advanced/weight-streaming.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -61,7 +61,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -334,6 +334,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -355,6 +356,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -419,6 +421,7 @@
 <li class="toctree-l1"><a class="reference internal" href="graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -453,6 +456,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -681,6 +685,15 @@ python3<span class="w"> </span>examples/summarize.py<span class="w"> </span><spa
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/architecture/add-model.html b/latest/architecture/add-model.html
index 5589696f51..0eb4c03b31 100644
--- a/latest/architecture/add-model.html
+++ b/latest/architecture/add-model.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -743,6 +747,15 @@ python<span class="w"> </span>../summarize.py<span class="w"> </span>--engine_di
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/architecture/checkpoint.html b/latest/architecture/checkpoint.html
index 57cc4154f4..d56c1bb644 100644
--- a/latest/architecture/checkpoint.html
+++ b/latest/architecture/checkpoint.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1010,6 +1014,15 @@ trtllm-build<span class="w"> </span>--checkpoint_dir<span class="w"> </span>./op
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/architecture/core-concepts.html b/latest/architecture/core-concepts.html
index 6ab992748c..0b308ed822 100644
--- a/latest/architecture/core-concepts.html
+++ b/latest/architecture/core-concepts.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1014,6 +1018,15 @@ srun<span class="w"> </span><span class="se">\</span>
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/architecture/model-weights-loader.html b/latest/architecture/model-weights-loader.html
index d232c05488..bc27b875a5 100644
--- a/latest/architecture/model-weights-loader.html
+++ b/latest/architecture/model-weights-loader.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -61,7 +61,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -334,6 +334,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -355,6 +356,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -419,6 +421,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -453,6 +456,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -932,6 +936,15 @@ The support for Qwen-1 is in <code class="docutils literal notranslate"><span cl
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/architecture/overview.html b/latest/architecture/overview.html
index a8e2489ca9..2000e1863a 100644
--- a/latest/architecture/overview.html
+++ b/latest/architecture/overview.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -661,6 +665,15 @@ Server</a> to easily create web-based services for LLMs. TensorRT-LLM supports m
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/architecture/workflow.html b/latest/architecture/workflow.html
index 83b9ea3018..98d35d1308 100644
--- a/latest/architecture/workflow.html
+++ b/latest/architecture/workflow.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -840,6 +844,15 @@ The usage of this API looks like this:</p>
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.html b/latest/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.html
index 0e66b9506f..f3c01d3f06 100644
--- a/latest/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.html
+++ b/latest/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -61,7 +61,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -334,6 +334,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -355,6 +356,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -419,6 +421,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -453,6 +456,7 @@
 <li class="toctree-l1"><a class="reference internal" href="quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -651,9 +655,8 @@ The command to generate synthetic dataset will be attached to the max throughput
 <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">YOUR_DATA_PATH</span><span class="o">=</span>&lt;your<span class="w"> </span>dataset<span class="w"> </span>file<span class="w"> </span>following<span class="w"> </span>the<span class="w"> </span>format&gt;
 
 cat<span class="w"> </span>&gt;./extra-llm-api-config.yml<span class="s">&lt;&lt;EOF</span>
-<span class="s">pytorch_backend_config:</span>
-<span class="s">    use_cuda_graph: true</span>
-<span class="s">    moe_backend: TRTLLM</span>
+<span class="s">use_cuda_graph: true</span>
+<span class="s">moe_backend: TRTLLM</span>
 <span class="s">speculative_config:</span>
 <span class="s">    decoding_type: MTP</span>
 <span class="s">    num_nextn_predict_layers: 3</span>
@@ -720,21 +723,20 @@ python<span class="w"> </span><span class="si">${</span><span class="nv">YOUR_WO
 <span class="nv">YOUR_DATA_PATH</span><span class="o">=</span>./dataset.txt
 
 cat<span class="w"> </span>&gt;./extra-llm-api-config.yml<span class="w"> </span><span class="s">&lt;&lt;EOF</span>
-<span class="s">pytorch_backend_config:</span>
-<span class="s">    use_cuda_graph: true</span>
-<span class="s">    cuda_graph_padding_enabled: true</span>
-<span class="s">    cuda_graph_batch_sizes:</span>
-<span class="s">    - 1</span>
-<span class="s">    - 2</span>
-<span class="s">    - 4</span>
-<span class="s">    - 8</span>
-<span class="s">    - 16</span>
-<span class="s">    - 32</span>
-<span class="s">    - 64</span>
-<span class="s">    - 128</span>
-<span class="s">    - 256</span>
-<span class="s">    - 384</span>
-<span class="s">    print_iter_log: true</span>
+<span class="s">use_cuda_graph: true</span>
+<span class="s">cuda_graph_padding_enabled: true</span>
+<span class="s">cuda_graph_batch_sizes:</span>
+<span class="s">- 1</span>
+<span class="s">- 2</span>
+<span class="s">- 4</span>
+<span class="s">- 8</span>
+<span class="s">- 16</span>
+<span class="s">- 32</span>
+<span class="s">- 64</span>
+<span class="s">- 128</span>
+<span class="s">- 256</span>
+<span class="s">- 384</span>
+<span class="s">print_iter_log: true</span>
 <span class="s">enable_attention_dp: true</span>
 <span class="s">EOF</span>
 
@@ -777,8 +779,7 @@ To do the benchmark, run the following command:</p>
 <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">YOUR_DATA_PATH</span><span class="o">=</span>&lt;your<span class="w"> </span>dataset<span class="w"> </span>file<span class="w"> </span>following<span class="w"> </span>the<span class="w"> </span>format&gt;
 
 cat<span class="w"> </span>&gt;./extra-llm-api-config.yml<span class="s">&lt;&lt;EOF</span>
-<span class="s">pytorch_backend_config:</span>
-<span class="s">    use_cuda_graph: true</span>
+<span class="s">use_cuda_graph: true</span>
 <span class="s">speculative_config:</span>
 <span class="s">    decoding_type: MTP</span>
 <span class="s">    num_nextn_predict_layers: 3</span>
@@ -827,10 +828,9 @@ python<span class="w"> </span><span class="si">${</span><span class="nv">YOUR_WO
 <span class="nv">YOUR_DATA_PATH</span><span class="o">=</span>./dataset.txt
 
 cat<span class="w"> </span>&gt;./extra-llm-api-config.yml<span class="s">&lt;&lt;EOF</span>
-<span class="s">pytorch_backend_config:</span>
-<span class="s">    use_cuda_graph: true</span>
-<span class="s">    cuda_graph_batch_sizes:</span>
-<span class="s">    - 128</span>
+<span class="s">use_cuda_graph: true</span>
+<span class="s">cuda_graph_batch_sizes:</span>
+<span class="s">- 128</span>
 <span class="s">enable_attention_dp: true</span>
 <span class="s">EOF</span>
 
@@ -1050,6 +1050,15 @@ For more details on <code class="docutils literal notranslate"><span class="pre"
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/blogs/Falcon180B-H200.html b/latest/blogs/Falcon180B-H200.html
index 8103ba1e30..10cab9c4b9 100644
--- a/latest/blogs/Falcon180B-H200.html
+++ b/latest/blogs/Falcon180B-H200.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -799,6 +803,15 @@ ISL = Input Sequence Length
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/blogs/H100vsA100.html b/latest/blogs/H100vsA100.html
index 7cb06537d3..4f4be35ed3 100644
--- a/latest/blogs/H100vsA100.html
+++ b/latest/blogs/H100vsA100.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -751,6 +755,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/blogs/H200launch.html b/latest/blogs/H200launch.html
index c1310bffc9..011fa0da20 100644
--- a/latest/blogs/H200launch.html
+++ b/latest/blogs/H200launch.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -743,6 +747,15 @@ TensorRT-LLM v0.5.0, TensorRT v9.1.0.4 | H200, H100 FP8. </sub></p>
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/blogs/XQA-kernel.html b/latest/blogs/XQA-kernel.html
index b7f986398c..a9eaca8cd0 100644
--- a/latest/blogs/XQA-kernel.html
+++ b/latest/blogs/XQA-kernel.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1 current active"><a class="current reference internal" href="#">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -710,6 +714,15 @@ ISL = Input Sequence Length
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/blogs/quantization-in-TRT-LLM.html b/latest/blogs/quantization-in-TRT-LLM.html
index a2c3c391cf..3263254481 100644
--- a/latest/blogs/quantization-in-TRT-LLM.html
+++ b/latest/blogs/quantization-in-TRT-LLM.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1 current active"><a class="current reference internal" href="#">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -865,6 +869,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html b/latest/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html
index ee31300050..39a5ba920f 100644
--- a/latest/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html
+++ b/latest/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html
@@ -51,18 +51,19 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
     <link rel="icon" href="../../_static/favicon.png"/>
     <link rel="index" title="Index" href="../../genindex.html" />
     <link rel="search" title="Search" href="../../search.html" />
+    <link rel="next" title="DeepSeek R1 MTP Implementation and Optimization" href="blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html" />
     <link rel="prev" title="New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget" href="../XQA-kernel.html" />
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -335,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -356,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -420,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -454,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1 current active"><a class="current reference internal" href="#">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -745,7 +750,7 @@
 </tr>
 <tr class="row-even"><td class="text-left"><p>Optimize Fuse_A_GEMM and Router_GEMM</p></td>
 <td class="text-center"><p>340</p></td>
-<td class="text-left"><p>WIP: <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/pull/4115">PR #4115</a></p></td>
+<td class="text-left"><p>WIP</p></td>
 </tr>
 <tr class="row-odd"><td class="text-left"><p>Relax Acceptance</p></td>
 <td class="text-center"><p><strong>368</strong></p></td>
@@ -952,7 +957,7 @@
 </section>
 <section id="routergemm">
 <h5>RouterGEMM<a class="headerlink" href="#routergemm" title="Link to this heading">#</a></h5>
-<p>By leveraging our internal AI code generator, we automatically generate an optimized RouterGEMM kernel, which delivers substantial improvements over the default GEMM implementation when <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/pull/4115/files#diff-006ae982200a5ef2b27f4aedb526025e64406d3c2fadde329ea745793fac04edR303:~:text=and%20hidden_states.-,size,-(0)">num_tokens &lt;=30</a></p>
+<p>By leveraging our internal AI code generator, we automatically generate an optimized RouterGEMM kernel, which delivers substantial improvements over the default GEMM implementation when num_tokens &lt;=30.</p>
 <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog1_router_gemm.png?raw=true" alt="tech_blog1_router_gemm" width="500" height="auto">
 </section>
 </section>
@@ -1011,6 +1016,15 @@
         <p class="prev-next-title">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</p>
       </div>
     </a>
+    <a class="right-next"
+       href="blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html"
+       title="next page">
+      <div class="prev-next-info">
+        <p class="prev-next-subtitle">next</p>
+        <p class="prev-next-title">DeepSeek R1 MTP Implementation and Optimization</p>
+      </div>
+      <i class="fa-solid fa-angle-right"></i>
+    </a>
 </div>
                 </footer>
               
@@ -1171,6 +1185,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html b/latest/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html
new file mode 100644
index 0000000000..8bad27f901
--- /dev/null
+++ b/latest/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html
@@ -0,0 +1,946 @@
+
+
+<!DOCTYPE html>
+
+
+<html lang="en" data-content_root="../../" >
+
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
+
+    <title>DeepSeek R1 MTP Implementation and Optimization &#8212; TensorRT-LLM</title>
+  
+  
+  
+  <script data-cfasync="false">
+    document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
+    document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
+  </script>
+  <!--
+    this gives us a CSS class that will be invisible only if JS is disabled
+  -->
+  <noscript>
+    <style>
+      .pst-js-only { display: none !important; }
+
+    </style>
+  </noscript>
+  
+  <!-- Loaded before other Sphinx assets -->
+  <link href="../../_static/styles/theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
+<link href="../../_static/styles/pydata-sphinx-theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
+
+    <link rel="stylesheet" type="text/css" href="../../_static/pygments.css?v=8f2a1f02" />
+    <link rel="stylesheet" type="text/css" href="../../_static/styles/nvidia-sphinx-theme.css?v=df3ac72c" />
+    <link rel="stylesheet" type="text/css" href="../../_static/copybutton.css?v=76b2166b" />
+    <link rel="stylesheet" type="text/css" href="../../_static/autodoc_pydantic.css" />
+  
+  <!-- So that users can add custom icons -->
+  <script src="../../_static/scripts/fontawesome.js?digest=8878045cc6db502f8baf"></script>
+  <!-- Pre-loaded scripts that we'll load fully later -->
+  <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf" />
+<link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf" />
+
+    <script src="../../_static/documentation_options.js?v=5929fcd5"></script>
+    <script src="../../_static/doctools.js?v=9a2dae69"></script>
+    <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
+    <script src="../../_static/clipboard.min.js?v=a7894cd8"></script>
+    <script src="../../_static/copybutton.js?v=65e89d2a"></script>
+    <script>DOCUMENTATION_OPTIONS.pagename = 'blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization';</script>
+    <script>
+        DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
+        DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
+        DOCUMENTATION_OPTIONS.show_version_warning_banner =
+            false;
+        </script>
+    <link rel="icon" href="../../_static/favicon.png"/>
+    <link rel="index" title="Index" href="../../genindex.html" />
+    <link rel="search" title="Search" href="../../search.html" />
+    <link rel="prev" title="Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs" href="blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html" />
+
+  <meta name="viewport" content="width=device-width, initial-scale=1"/>
+  <meta name="docsearch:language" content="en"/>
+  <meta name="docsearch:version" content="0.21.0rc0" />
+
+
+  </head>
+  
+  
+  <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
+
+  
+  
+  <div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
+  
+  <div id="pst-scroll-pixel-helper"></div>
+  
+  <button type="button" class="btn rounded-pill" id="pst-back-to-top">
+    <i class="fa-solid fa-arrow-up"></i>Back to top</button>
+
+  
+  <dialog id="pst-search-dialog">
+    
+<form class="bd-search d-flex align-items-center"
+      action="../../search.html"
+      method="get">
+  <i class="fa-solid fa-magnifying-glass"></i>
+  <input type="search"
+         class="form-control"
+         name="q"
+         placeholder="Search the docs ..."
+         aria-label="Search the docs ..."
+         autocomplete="off"
+         autocorrect="off"
+         autocapitalize="off"
+         spellcheck="false"/>
+  <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
+</form>
+  </dialog>
+
+  <div class="pst-async-banner-revealer d-none">
+  <aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
+</div>
+
+  
+    <header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
+<div class="bd-header__inner bd-page-width">
+  <button class="pst-navbar-icon sidebar-toggle primary-toggle" aria-label="Site navigation">
+    <span class="fa-solid fa-bars"></span>
+  </button>
+  
+  
+  <div class="col-lg-3 navbar-header-items__start">
+    
+      <div class="navbar-item">
+
+  
+    
+  
+
+<a class="navbar-brand logo" href="../../index.html">
+  
+  
+  
+  
+  
+    
+    
+      
+    
+    
+    <img src="../../_static/nvidia-logo-horiz-rgb-blk-for-screen.svg" class="logo__image only-light" alt="TensorRT-LLM - Home"/>
+    <img src="../../_static/nvidia-logo-horiz-rgb-wht-for-screen.svg" class="logo__image only-dark pst-js-only" alt="TensorRT-LLM - Home"/>
+  
+  
+    <p class="title logo__title">TensorRT-LLM</p>
+  
+</a></div>
+    
+  </div>
+  
+  <div class="col-lg-9 navbar-header-items">
+    
+    <div class="me-auto navbar-header-items__center">
+      
+        <div class="navbar-item">
+
+
+<div class="version-switcher__container dropdown pst-js-only">
+  <button id="pst-version-switcher-button-2"
+    type="button"
+    class="version-switcher__button btn btn-sm dropdown-toggle"
+    data-bs-toggle="dropdown"
+    aria-haspopup="listbox"
+    aria-controls="pst-version-switcher-list-2"
+    aria-label="Version switcher list"
+  >
+    Choose version  <!-- this text may get changed later by javascript -->
+    <span class="caret"></span>
+  </button>
+  <div id="pst-version-switcher-list-2"
+    class="version-switcher__menu dropdown-menu list-group-flush py-0"
+    role="listbox" aria-labelledby="pst-version-switcher-button-2">
+    <!-- dropdown will be populated by javascript on page load -->
+  </div>
+</div></div>
+      
+    </div>
+    
+    
+    <div class="navbar-header-items__end">
+      
+        <div class="navbar-item navbar-persistent--container">
+          
+
+<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
+ <i class="fa-solid fa-magnifying-glass"></i>
+ <span class="search-button__default-text">Search</span>
+ <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
+</button>
+        </div>
+      
+      
+        <div class="navbar-item">
+
+<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode"  data-bs-placement="bottom" data-bs-toggle="tooltip">
+  <i class="theme-switch fa-solid fa-sun                fa-lg" data-mode="light" title="Light"></i>
+  <i class="theme-switch fa-solid fa-moon               fa-lg" data-mode="dark"  title="Dark"></i>
+  <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"  title="System Settings"></i>
+</button></div>
+      
+    </div>
+    
+  </div>
+  
+  
+    <div class="navbar-persistent--mobile">
+
+<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
+ <i class="fa-solid fa-magnifying-glass"></i>
+ <span class="search-button__default-text">Search</span>
+ <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
+</button>
+    </div>
+  
+
+  
+    <button class="pst-navbar-icon sidebar-toggle secondary-toggle" aria-label="On this page">
+      <span class="fa-solid fa-outdent"></span>
+    </button>
+  
+</div>
+
+    </header>
+  
+
+  <div class="bd-container">
+    <div class="bd-container__inner bd-page-width">
+      
+      
+      
+      <dialog id="pst-primary-sidebar-modal"></dialog>
+      <div id="pst-primary-sidebar" class="bd-sidebar-primary bd-sidebar">
+        
+
+
+
+  
+    
+  
+
+<a class="navbar-brand logo" href="../../index.html">
+  
+  
+  
+  
+  
+    
+    
+      
+    
+    
+    <img src="../../_static/nvidia-logo-horiz-rgb-blk-for-screen.svg" class="logo__image only-light" alt="TensorRT-LLM - Home"/>
+    <img src="../../_static/nvidia-logo-horiz-rgb-wht-for-screen.svg" class="logo__image only-dark pst-js-only" alt="TensorRT-LLM - Home"/>
+  
+  
+    <p class="title logo__title">TensorRT-LLM</p>
+  
+</a>
+
+
+  
+  <div class="sidebar-header-items sidebar-primary__section">
+    
+    
+      <div class="sidebar-header-items__center">
+        
+          
+          
+            <div class="navbar-item">
+
+
+<div class="version-switcher__container dropdown pst-js-only">
+  <button id="pst-version-switcher-button-3"
+    type="button"
+    class="version-switcher__button btn btn-sm dropdown-toggle"
+    data-bs-toggle="dropdown"
+    aria-haspopup="listbox"
+    aria-controls="pst-version-switcher-list-3"
+    aria-label="Version switcher list"
+  >
+    Choose version  <!-- this text may get changed later by javascript -->
+    <span class="caret"></span>
+  </button>
+  <div id="pst-version-switcher-list-3"
+    class="version-switcher__menu dropdown-menu list-group-flush py-0"
+    role="listbox" aria-labelledby="pst-version-switcher-button-3">
+    <!-- dropdown will be populated by javascript on page load -->
+  </div>
+</div></div>
+          
+        
+      </div>
+    
+    
+    
+      <div class="sidebar-header-items__end">
+        
+          <div class="navbar-item">
+
+<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode"  data-bs-placement="bottom" data-bs-toggle="tooltip">
+  <i class="theme-switch fa-solid fa-sun                fa-lg" data-mode="light" title="Light"></i>
+  <i class="theme-switch fa-solid fa-moon               fa-lg" data-mode="dark"  title="Dark"></i>
+  <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"  title="System Settings"></i>
+</button></div>
+        
+      </div>
+    
+  </div>
+  
+    <div class="sidebar-primary-items__start sidebar-primary__section">
+        <div class="sidebar-primary-item">
+
+
+
+<nav class="bd-docs-nav bd-links"
+     aria-label="Table of Contents">
+  <p class="bd-links__title" role="heading" aria-level="1">Table of Contents</p>
+  <div class="bd-toc-item navbar-nav"><p aria-level="2" class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../overview.html">Overview</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../quick-start-guide.html">Quick Start Guide</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../key-features.html">Key Features</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../torch.html">PyTorch Backend</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../release-notes.html">Release Notes</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Installation</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../installation/linux.html">Installing on Linux</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../installation/build-from-source-linux.html">Building from Source Code on Linux</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../installation/grace-hopper.html">Installing on Grace Hopper</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">LLM API</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../llm-api/index.html">API Introduction</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../llm-api/reference.html">API Reference</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_serve.html">Llm Mgmn Trtllm Serve</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1"><a class="reference internal" href="../../examples/customization.html">LLM Common Customizations</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_serve.html">Llm Mgmn Trtllm Serve</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../examples/trtllm_serve_examples.html">Online Serving Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/curl_chat_client.html">Curl Chat Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/curl_chat_client_for_multimodal.html">Curl Chat Client For Multimodal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/curl_completion_client.html">Curl Completion Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/genai_perf_client.html">Genai Perf Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client For Multimodal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
+</ul>
+</details></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Model Definition API</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../python-api/tensorrt_llm.layers.html">Layers</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../python-api/tensorrt_llm.functional.html">Functionals</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../python-api/tensorrt_llm.models.html">Models</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../python-api/tensorrt_llm.plugin.html">Plugin</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../python-api/tensorrt_llm.quantization.html">Quantization</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../python-api/tensorrt_llm.runtime.html">Runtime</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">C++ API</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../_cpp_gen/executor.html">Executor</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../_cpp_gen/runtime.html">Runtime</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-build.html">trtllm-build</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-serve.html">trtllm-serve</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../architecture/overview.html">TensorRT-LLM Architecture</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../architecture/core-concepts.html">Model Definition</a></li>
+
+
+
+<li class="toctree-l1"><a class="reference internal" href="../../architecture/checkpoint.html">TensorRT-LLM Checkpoint</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../architecture/workflow.html">TensorRT-LLM Build Workflow</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../architecture/add-model.html">Adding a Model</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Advanced</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/gpt-runtime.html">C++ GPT Runtime</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/executor.html">Executor API</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../performance/perf-overview.html">Overview</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../performance/perf-benchmarking.html">Benchmarking</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../performance/performance-tuning-guide/index.html">Performance Tuning Guide</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../performance/performance-tuning-guide/benchmarking-default-performance.html">Benchmarking Default Performance</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../performance/performance-tuning-guide/useful-build-time-flags.html">Useful Build-Time Flags</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.html">Tuning Max Batch Size and Max Num Tokens</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../performance/performance-tuning-guide/deciding-model-sharding-strategy.html">Deciding Model Sharding Strategy</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../performance/performance-tuning-guide/fp8-quantization.html">FP8 Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../performance/performance-tuning-guide/useful-runtime-flags.html">Useful Runtime Options</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1"><a class="reference internal" href="../../performance/perf-analysis.html">Performance Analysis</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Reference</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../reference/troubleshooting.html">Troubleshooting</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../reference/support-matrix.html">Support Matrix</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../reference/precision.html">Numerical Precision</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../reference/memory.html">Memory Usage of TensorRT-LLM</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Blogs</span></p>
+<ul class="current nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../H100vsA100.html">H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../H200launch.html">H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
+<li class="toctree-l1"><a class="reference internal" href="blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1 current active"><a class="current reference internal" href="#">DeepSeek R1 MTP Implementation and Optimization</a></li>
+</ul>
+</div>
+</nav></div>
+    </div>
+  
+  
+  <div class="sidebar-primary-items__end sidebar-primary__section">
+  </div>
+
+
+
+      </div>
+      
+      <main id="main-content" class="bd-main" role="main">
+        
+        
+          <div class="bd-content">
+            <div class="bd-article-container">
+              
+              <div class="bd-header-article d-print-none">
+<div class="header-article-items header-article__inner">
+  
+    <div class="header-article-items__start">
+      
+        <div class="header-article-item">
+
+<nav aria-label="Breadcrumb" class="d-print-none">
+  <ul class="bd-breadcrumbs">
+    
+    <li class="breadcrumb-item breadcrumb-home">
+      <a href="../../index.html" class="nav-link" aria-label="Home">
+        <i class="fa-solid fa-home"></i>
+      </a>
+    </li>
+    <li class="breadcrumb-item active" aria-current="page"><span class="ellipsis">DeepSeek R1 MTP Implementation and Optimization</span></li>
+  </ul>
+</nav>
+</div>
+      
+    </div>
+  
+  
+</div>
+</div>
+              
+              
+              
+                
+<div id="searchbox"></div>
+                <article class="bd-article">
+                  
+  <section id="deepseek-r1-mtp-implementation-and-optimization">
+<h1>DeepSeek R1 MTP Implementation and Optimization<a class="headerlink" href="#deepseek-r1-mtp-implementation-and-optimization" title="Link to this heading">#</a></h1>
+<p>by NVIDIA TensorRT-LLM team</p>
+<section id="table-of-contents">
+<h2>Table of Contents<a class="headerlink" href="#table-of-contents" title="Link to this heading">#</a></h2>
+<ul class="simple">
+<li><p><a class="reference internal" href="#mtp-for-inference">MTP for inference</a></p>
+<ul>
+<li><p><a class="reference internal" href="#background">Background</a></p></li>
+<li><p><a class="reference internal" href="#mtp-vanilla">MTP Vanilla</a></p></li>
+<li><p><a class="reference internal" href="#mtp-eagle">MTP Eagle</a></p></li>
+</ul>
+</li>
+<li><p><a class="reference internal" href="#mtp-implementation-in-tensorrt-llm">MTP implementation in TensorRT-LLM</a></p>
+<ul>
+<li><p><a class="reference internal" href="#basic-implementation">Basic Implementation</a></p></li>
+<li><p><a class="reference internal" href="#mtp-modules">MTP Modules</a></p></li>
+<li><p><a class="reference internal" href="#attention-for-mtp">Attention for MTP</a></p></li>
+<li><p><a class="reference internal" href="#how-to-run-deepseek-models-with-mtp">How to run DeepSeek models with MTP</a></p></li>
+</ul>
+</li>
+<li><p><a class="reference internal" href="#mtp-optimization-relaxed-acceptance">MTP optimization - Relaxed Acceptance</a></p>
+<ul>
+<li><p><a class="reference internal" href="#relaxed-acceptance">Relaxed Acceptance</a></p></li>
+<li><p><a class="reference internal" href="#how-to-run-the-deepseek-r1-model-with-relaxed-acceptance">How to run the DeepSeek-R1 model with Relaxed Acceptance</a></p></li>
+</ul>
+</li>
+<li><p><a class="reference internal" href="#evaluation">Evaluation</a></p>
+<ul>
+<li><p><a class="reference internal" href="#achieving-speedup-with-mtp-speculative-decoding">Achieving speedup with MTP speculative decoding</a></p></li>
+<li><p><a class="reference internal" href="#accuracy-studies-for-relaxed-acceptance">Accuracy studies for Relaxed Acceptance</a></p></li>
+</ul>
+</li>
+<li><p><a class="reference internal" href="#future-works">Future Works</a></p>
+<ul>
+<li><p><a class="reference internal" href="#tree-based-speculative-decoding-support">Tree-based speculative decoding support</a></p></li>
+<li><p><a class="reference internal" href="#eagle3-support">Eagle3 support</a></p></li>
+<li><p><a class="reference internal" href="#fix-known-issues">Fix known issues</a></p></li>
+</ul>
+</li>
+<li><p><a class="reference internal" href="#acknowledgment">Acknowledgment</a></p></li>
+</ul>
+<p>TensorRT-LLM achieves world-record inference performance for DeepSeek-R1 on NVIDIA Blackwell GPUs, where Multi-Token Prediction (MTP) delivers a significant speedup. In our <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md">previous blog post</a>, we discussed the key optimizations that enable the outstanding inference latency of the DeepSeek-R1 model. This article dives deeper into the implementation and optimization of MTP in TensorRT-LLM.</p>
+</section>
+<section id="mtp-for-inference">
+<h2>MTP for inference<a class="headerlink" href="#mtp-for-inference" title="Link to this heading">#</a></h2>
+<p>Inspired by a previous <a class="reference external" href="https://arxiv.org/pdf/2404.19737">research work</a>, MTP is designed to help the DeepSeek-V3 training. It adds additional MTP modules at the end of the main model and uses them to predict additional tokens. In this way, MTP can extend the prediction scope to multiple future tokens at each position to achieve better model accuracy. During inference, those MTP modules can also be used for speculative decoding to improve the generation latency further. In this section, we will introduce the MTP speculative decoding algorithm for LLM inference.</p>
+<section id="background">
+<h3>Background<a class="headerlink" href="#background" title="Link to this heading">#</a></h3>
+<p>Speculative decoding is a popular technique for faster and cost-effective LLM inference. It’s based on the premise that generating multiple future tokens (especially for the decode phase, which is less compute bound) is more efficient than processing a single token. Speculative decoding techniques usually divide the process into a low-cost draft stage and a parallelized verification stage. The draft stage predicts draft tokens by using a small model or a subset of layers in the main model. And the verification stage uses the main model to determine how many of these draft tokens to accept, which is far more efficient than generating one token per iteration.</p>
+<div align="center">
+<figure>
+  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_verify_and_accept.png" alt="tech_blog2_verify_and_accept" width="1280" height="auto">
+</figure>
+</div>
+<p align="center"><sub><em>Figure 1. Verification example</em></sub></p>
+<p>Figure 1 shows an example of how to verify and accept those draft tokens. Assuming there are a total of 5 draft tokens “ABCDE”, we will append them to the input token “G”, and input a total of 6 tokens to the main model. After sampling, we can get six different expected tokens, then compare the expected tokens with the draft tokens and accept the longest prefix matched tokens. In this example, the tokens “ABC” are matched. Because “H” is predicted by the main model and the corresponding input token “C” is already accepted, “H” will also be accepted. In this way, we can accept four tokens in a single iteration. MTP also uses this method to verify and accept draft tokens.
+For the draft stage in MTP, there are two different MTP methods, MTP vanilla and MTP eagle. They can be used for different inference cases.</p>
+</section>
+<section id="mtp-vanilla">
+<h3>MTP Vanilla<a class="headerlink" href="#mtp-vanilla" title="Link to this heading">#</a></h3>
+<div align="center">
+<figure>
+  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_mtp_vanilla.png" alt="tech_blog2_mtp_vanilla" width="640" height="auto">
+</figure>
+</div>
+<p align="left"><sub><em>Figure 2. MTP Vanilla, where t<sub>i</sub> is the input token, d<sub>i</sub> is the predicted draft token, K is the number of MTP modules, and h<sub>i</sub><sup>n</sup> is the hidden state of the n-th MTP module. Note that h<sub>0</sub> means the hidden states of the main model. (Disclaimer: the figures are adapted from the original DeepSeek V3 tech report)</em></sub></p>
+<p>MTP Vanilla method is more similar to the MTP training, and it sequentially uses different MTP modules to predict multiple draft tokens. This method can support model checkpoints with weights of multiple different MTP modules. And each MTP module will have its own KV cache.</p>
+<p>Figure 2 illustrates the MTP vanilla inference. In the context phase, assuming there are a total of four input tokens, we will get the output token $t_5$ and the hidden states after the main model forward. The output token will be appended to the input tokens, then we shift out the first token to get tokens from $t_2$ to $t_5$ as the input tokens of the first MTP module. The hidden states from the main model will be directly used as the input of the first MTP module to predict the first draft token. For the next several MTP modules, we will use the same method to prepare the inputs to predict the sequential draft tokens.</p>
+<p>In the generation phase, there will be a little difference. The predicted token $t_5$ and the draft tokens will be used as inputs for the main model. After the main model forward, we will do the verification to get the accepted tokens. In this example, assume that $j$ draft tokens $d_6$~$d_{j+5}$ are accepted. We then prepare the MTP module inputs. Different from the context phase, we will prepare input IDs and hidden states of a total of $K$ tokens before the last accepted token. In this example, the last accepted token is $t_{j+6}$. Then we can get the first draft token after the first MTP module forward. For the sequential MTP modules, we can prepare their inputs in a similar way to the MTP modules in the context phase, so all of those MTP modules have the same input sequence length. After predicting all of the draft tokens, we need to evict the keys/values of those rejected draft tokens from the main model’s KV cache to ensure the subsequent calculation is correct.</p>
+</section>
+<section id="mtp-eagle">
+<h3>MTP Eagle<a class="headerlink" href="#mtp-eagle" title="Link to this heading">#</a></h3>
+<div align="center">
+<figure>
+  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_mtp_eagle.png" alt="tech_blog2_mtp_eagle" width="640" height="auto">
+</figure>
+</div>
+<p align="center"><sub><em>Figure 3. MTP Eagle, using the same notation as Figure 2</em></sub></p>
+<p>MTP Eagle can be viewed as a variant of <a class="reference external" href="https://arxiv.org/pdf/2401.15077">Eagle</a> speculative decoding method, but only supports chain decoding now. It reuses the same MTP module and repeats multiple times to predict draft tokens. MTP Eagle supports the model checkpoint with only one MTP module. The official DeepSeek-V3 and DeepSeek-R1 have only one MTP module in their checkpoints. Another difference with MTP vanilla is the KV cache. In the MTP Eagle method, the MTP module reuses the same KV cache when predicting multiple draft tokens.</p>
+<p>Figure 3 gives an MTP Eagle example. In the context phase, the inputs of the first MTP module forward are the same as the MTP Vanilla. However, for the sequential MTP module forward, the first difference is that MTP Eagle uses the same MTP module to predict draft tokens and reuses the same KV cache. Another difference is that we only need to input the token ID and the hidden state of one token. The token is the last predicted draft token, while the hidden state is the corresponding hidden state in the last MTP module forward. In this way, we can predict total K draft tokens by using only one MTP module.</p>
+<p>In the generation phase, the verification stage is the same as MTP Vanilla. After getting the accepted tokens, we will use the last accepted tokens and the corresponding hidden state as the inputs of the first MTP module forward. Compared with MTP Vanilla, it will be much easier to implement. And the sequential MTP module forwards use the same method as the context phase to prepare inputs. After predicting all of the draft tokens, we need to evict the keys/values of those rejected draft tokens from the main model’s KV cache.</p>
+</section>
+</section>
+<section id="mtp-implementation-in-tensorrt-llm">
+<h2>MTP implementation in TensorRT-LLM<a class="headerlink" href="#mtp-implementation-in-tensorrt-llm" title="Link to this heading">#</a></h2>
+<section id="basic-implementation">
+<h3>Basic Implementation<a class="headerlink" href="#basic-implementation" title="Link to this heading">#</a></h3>
+<p>TensorRT-LLM has two different paths for MTP, one for <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/338744fba6a91147b739b7f02d19b37bc19aa17a/tensorrt_llm/_torch/speculative/mtp.py#L326">MTP Vanilla</a> and another for <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/338744fba6a91147b739b7f02d19b37bc19aa17a/tensorrt_llm/_torch/speculative/mtp.py#L1047">MTP Eagle</a>. MTP Eagle is the default path for DeepSeek-V3 and DeepSeek-R1 models.</p>
+<div align="center">
+<figure>
+  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_overall_workflow.png" alt="tech_blog2_overall_workflow" width="800" height="auto">
+</figure>
+</div>
+<p align="center"><sub><em>Figure 4. MTP workflow in TensorRT-LLM</em></sub></p>
+<p>Figure 4 shows the overall workflow of MTP in TensorRT-LLM. Both paths share the runtime workflow, and the differences are in the MTP modules forward. In the context phase, there is no draft token in the inputs. TensorRT-LLM model engine fetches the input IDs from the requests and inputs to the model engine forward to get the next token and the hidden state. Then we prepare the MTP module inputs, and the MTP modules forward the inputs to predict the draft tokens.</p>
+<p>The generation workflow is more complicated. We need to do both the verification and draft stages. The predicted new token and draft tokens are the inputs for the main model. After the main model forward, we can sample from the output logits and get the following new tokens. Then compare them with the input draft tokens to get the final accepted tokens. The verification stage will be finished here. We will use the accepted tokens and hidden states to start a new draft stage, which uses the MTP layers to predict new draft tokens for the next iteration. Finally, we need to rewind the KV cache to evict keys/values corresponding to those rejected tokens.</p>
+<p>Except for the Rewind KV Cache, all of those processes are inside the model engine forward function. In this way, we can use one model engine to support MTP inference, and it would be easier for MTP to be compatible with other features, such as CUDA graph and overlap scheduler. When enabling CUDA graph, both the verification and draft stages can be captured in one graph, significantly reducing CPU overhead.</p>
+</section>
+<section id="mtp-modules">
+<h3>MTP Modules<a class="headerlink" href="#mtp-modules" title="Link to this heading">#</a></h3>
+<div align="center">
+<figure>
+  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_mtp_modules.png" alt="tech_blog2_mtp_modules" width="640" height="auto">
+</figure>
+</div>
+<p align="center"><sub><em>Figure 5. MTP model architecture</em></sub></p>
+<p>Figure 5 introduces the basic model architecture of <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/338744fba6a91147b739b7f02d19b37bc19aa17a/tensorrt_llm/_torch/speculative/mtp.py#L326">MTP Vanilla</a>, <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/338744fba6a91147b739b7f02d19b37bc19aa17a/tensorrt_llm/_torch/speculative/mtp.py#L1047">MTP Eagle</a>, and the basic <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/338744fba6a91147b739b7f02d19b37bc19aa17a/tensorrt_llm/_torch/models/modeling_deepseekv3.py#L829">MTP module</a> design. Because MTP vanilla needs $K$ input tokens, if the number of accepted tokens is less than the number of input tokens, i.e. $j&lt;K$, we need to use the old token IDs and hidden states as the input of the first MTP module. To avoid bringing much additional computation overhead, we add two tensors for each request to save the past $K$ input IDs and the hidden states of past $K$ tokens, and update them by using the accepted tokens and corresponding hidden states each iteration. In this way, we can read these tensors when preparing inputs for the first MTP module. MTP Eagle implementation is much easier and straightforward, just call the same MTP module forward $K$ times to get $K$ new draft tokens.</p>
+<p>The MTP module follows the design in DeepSeek-V3. The embedding layer and output head in MTP modules are shared with the main model, which can save GPU memory consumption.</p>
+</section>
+<section id="attention-for-mtp">
+<h3>Attention for MTP<a class="headerlink" href="#attention-for-mtp" title="Link to this heading">#</a></h3>
+<p>Attention is also a very important component in supporting MTP inference. The changes are mainly in the attention kernels for the generation phase. For the normal request, there will be only one input token in the generation phase, but for MTP, there will be $K+1$ input tokens. Since MTP sequentially predicts additional tokens, the predicted draft tokens are chained. Though we have an MTP Eagle path, currently, we only have the chain-based support for MTP Eagle. So, a causal mask is enough for the attention kernel to support MTP. In our implementation, TensorRT-LLM will use the fp8 flashMLA generation kernel on Hopper GPU, while using TRTLLM customized attention kernels on Blackwell for better performance.</p>
+</section>
+<section id="how-to-run-deepseek-models-with-mtp">
+<h3>How to run DeepSeek models with MTP<a class="headerlink" href="#how-to-run-deepseek-models-with-mtp" title="Link to this heading">#</a></h3>
+<p>To run DeepSeek-V3/R1 models with MTP, use <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/pytorch/quickstart_advanced.py">examples/pytorch/quickstart_advanced.py</a> with additional options:</p>
+<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">cd</span><span class="w"> </span>examples/pytorch
+python<span class="w"> </span>quickstart_advanced.py<span class="w"> </span>--model_dir<span class="w"> </span>&lt;YOUR_MODEL_DIR&gt;<span class="w"> </span>--spec_decode_algo<span class="w"> </span>MTP<span class="w"> </span>--spec_decode_nextn<span class="w"> </span>N
+</pre></div>
+</div>
+<p>To benchmark min-latency performance with MTP, you need to follow <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/models/core/deepseek_v3/README.md#6-dataset-preparation">this document</a> to prepare your dataset, then follow the steps below:</p>
+<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">YOUR_DATA_PATH</span><span class="o">=</span>&lt;your<span class="w"> </span>dataset<span class="w"> </span>file<span class="w"> </span>following<span class="w"> </span>the<span class="w"> </span>format&gt;
+
+cat<span class="w"> </span>&gt;./extra-llm-api-config.yml<span class="s">&lt;&lt;EOF</span>
+<span class="s">use_cuda_graph: true</span>
+<span class="s">moe_backend: TRTLLM</span>
+<span class="s">speculative_config:</span>
+<span class="s">    decoding_type: MTP</span>
+<span class="s">    num_nextn_predict_layers: 3</span>
+<span class="s">EOF</span>
+
+<span class="nb">export</span><span class="w"> </span><span class="nv">TRTLLM_ENABLE_PDL</span><span class="o">=</span><span class="m">1</span>
+
+trtllm-bench<span class="w"> </span>--model<span class="w"> </span>nvidia/DeepSeek-R1-FP4<span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>throughput<span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--dataset<span class="w"> </span><span class="nv">$YOUR_DATA_PATH</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--backend<span class="w"> </span>pytorch<span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--num_requests<span class="w"> </span><span class="m">10</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--concurrency<span class="w"> </span><span class="m">1</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--max_batch_size<span class="w"> </span><span class="m">1</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--tp<span class="w"> </span><span class="m">8</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--ep<span class="w"> </span><span class="m">2</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--extra_llm_api_options<span class="w"> </span>./extra-llm-api-config.yml
+</pre></div>
+</div>
+</section>
+</section>
+<section id="mtp-optimization-relaxed-acceptance">
+<h2>MTP optimization - Relaxed Acceptance<a class="headerlink" href="#mtp-optimization-relaxed-acceptance" title="Link to this heading">#</a></h2>
+<p>DeepSeek-R1 is a reasoning model that first outputs some thinking tokens, after which the user can get the actual outputs. The thinking process usually takes up a lot of tokens, and the quality of the outputs of the thinking process may have a limited impact on the final answer. So we want to use a more aggressive acceptance strategy, called <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/pull/3865">relaxed acceptance</a>, for the thinking process to speed up the thinking decoding phase. This will be a tradeoff between speedup and output quality. From the experimental results, the impact of relaxed acceptance on output quality is limited.</p>
+<section id="relaxed-acceptance">
+<h3>Relaxed Acceptance<a class="headerlink" href="#relaxed-acceptance" title="Link to this heading">#</a></h3>
+<div align="center">
+<figure>
+  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_relaxed_acceptance.png" alt="tech_blog2_relaxed_acceptance" width="1024" height="auto">
+</figure>
+</div>
+<p align="center"><sub><em>Figure 6. Relaxed Acceptance example. Use MTP nextn=4 and top-3 in this example.</em></sub></p>
+<p>In the previous verification and acceptance, we use top-1 sampling on the logits of the main model to get the “expected” tokens as shown in Figure 1. There will be only one choice to compare with the draft tokens, which we call “Strict Acceptance”.</p>
+<p>As for the Relaxed Acceptance, we first get the top-N tokens sampled from the logits, so more candidates will be compared with the input draft tokens. To make sure the accepted tokens are as accurate as possible, we also added a probability threshold, i.e., delta. We can get the token probabilities by applying a softmax to the logits. After getting the top-N tokens, we will remove tokens from the candidate list if their probability is smaller than the (top-1 probability - delta). In this way, we may get more than one token candidate, and all of those tokens have a high probability. Then we can compare the input draft tokens with those candidates. If one of them matches, we can accept this draft token, so the acceptance rate will be increased. Figure 6 shows an example of a comparison between Strict Acceptance and Relaxed Acceptance.</p>
+<p>Note that the Relaxed Acceptance will only be used during the thinking phase, while the Strict Acceptance will still be used during the non-thinking phase. And the Relaxed Acceptance only supports the DeepSeek-R1 model now.</p>
+</section>
+<section id="how-to-run-the-deepseek-r1-model-with-relaxed-acceptance">
+<h3>How to run the DeepSeek-R1 model with Relaxed Acceptance<a class="headerlink" href="#how-to-run-the-deepseek-r1-model-with-relaxed-acceptance" title="Link to this heading">#</a></h3>
+<p>To run DeepSeek-R1 models with MTP Relaxed Acceptance, use <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/pytorch/quickstart_advanced.py">examples/pytorch/quickstart_advanced.py</a> with additional options:</p>
+<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">cd</span><span class="w"> </span>examples/pytorch
+python<span class="w"> </span>quickstart_advanced.py<span class="w"> </span>--model_dir<span class="w"> </span>&lt;YOUR_MODEL_DIR&gt;<span class="w"> </span>--spec_decode_algo<span class="w"> </span>MTP<span class="w"> </span>--spec_decode_nextn<span class="w"> </span>N<span class="w"> </span>--use_relaxed_acceptance_for_thinking<span class="w"> </span>--relaxed_topk<span class="w"> </span><span class="m">10</span><span class="w"> </span>--relaxed_delta<span class="w"> </span><span class="m">0</span>.6
+</pre></div>
+</div>
+<p>To benchmark min-latency performance with MTP Relaxed Acceptance, you need to follow <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/models/core/deepseek_v3/README.md#6-dataset-preparation">this document</a> to prepare your dataset, then follow the steps below:</p>
+<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">YOUR_DATA_PATH</span><span class="o">=</span>&lt;your<span class="w"> </span>dataset<span class="w"> </span>file<span class="w"> </span>following<span class="w"> </span>the<span class="w"> </span>format&gt;
+
+cat<span class="w"> </span>&gt;./extra-llm-api-config.yml<span class="s">&lt;&lt;EOF</span>
+<span class="s">use_cuda_graph: true</span>
+<span class="s">moe_backend: TRTLLM</span>
+<span class="s">speculative_config:</span>
+<span class="s">    decoding_type: MTP</span>
+<span class="s">    num_nextn_predict_layers: 3</span>
+<span class="s">    use_relaxed_acceptance_for_thinking: true</span>
+<span class="s">    relaxed_topk: 10</span>
+<span class="s">    relaxed_delta: 0.6</span>
+<span class="s">EOF</span>
+
+<span class="nb">export</span><span class="w"> </span><span class="nv">TRTLLM_ENABLE_PDL</span><span class="o">=</span><span class="m">1</span>
+
+trtllm-bench<span class="w"> </span>--model<span class="w"> </span>nvidia/DeepSeek-R1-FP4<span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>throughput<span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--dataset<span class="w"> </span><span class="nv">$YOUR_DATA_PATH</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--backend<span class="w"> </span>pytorch<span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--num_requests<span class="w"> </span><span class="m">10</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--concurrency<span class="w"> </span><span class="m">1</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--max_batch_size<span class="w"> </span><span class="m">1</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--tp<span class="w"> </span><span class="m">8</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--ep<span class="w"> </span><span class="m">2</span><span class="w"> </span><span class="se">\</span>
+<span class="w">    </span>--extra_llm_api_options<span class="w"> </span>./extra-llm-api-config.yml
+</pre></div>
+</div>
+</section>
+</section>
+<section id="evaluation">
+<h2>Evaluation<a class="headerlink" href="#evaluation" title="Link to this heading">#</a></h2>
+<section id="achieving-speedup-with-mtp-speculative-decoding">
+<h3>Achieving speedup with MTP speculative decoding<a class="headerlink" href="#achieving-speedup-with-mtp-speculative-decoding" title="Link to this heading">#</a></h3>
+<div align="center">
+<figure>
+  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_perf_and_ar.png" alt="tech_blog2_perf_and_ar" width="1280" height="auto">
+</figure>
+</div>
+<p align="center"><sub><em>Figure 7. DeepSeek-R1-FP4 671B min-latency performance with different MTP next-n</em></sub></p>
+<p>We tested the min-latency (batch size = 1) performance of the DeepSeek-R1-FP4 model with different MTP next-n on a B200 node. The MLA runs with TP=8, and the MoE runs with EP=2. And there are ten different requests with ISL/OSL=1K/2K. From Figure 7, we can see that MTP=3 can help get the best min-latency performance on 8 B200 GPUs, which can bring 2.16x speedup compared with the baseline nextn=0. And with the help of the relaxed acceptance, the min-latency performance can be further improved to achieve a 2.33x speedup. We also evaluated the CUDA graph and overlap scheduler benefits. For such a min-latency case, CUDA graph can achieve a 7.22x average speedup, while the overlap scheduler can achieve a 1.03x average speedup.</p>
+</section>
+<section id="accuracy-studies-for-relaxed-acceptance">
+<h3>Accuracy studies for Relaxed Acceptance<a class="headerlink" href="#accuracy-studies-for-relaxed-acceptance" title="Link to this heading">#</a></h3>
+<div align="center">
+<figure>
+  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_acc_relaxed_acceptance.png" alt="tech_blog2_acc_relaxed_acceptance" width="800" height="auto">
+</figure>
+</div>
+<p align="center"><sub><em>Figure 8. Ablation results for the Relaxed Acceptance. Using MTP nextn=3, top-10, and delta=0.6.</em></sub></p>
+<p>We validated the Relaxed Acceptance on different datasets. In Figure 8, we show the ablation results for Relaxed Acceptance by using the DeepSeek-R1-FP4 model. Compared with Strict Acceptance, the impact of Relaxed Acceptance on output quality is limited, resulting in only a slight accuracy drop.</p>
+</section>
+</section>
+<section id="future-works">
+<h2>Future Works<a class="headerlink" href="#future-works" title="Link to this heading">#</a></h2>
+<section id="tree-based-speculative-decoding-support">
+<h3>Tree-based speculative decoding support<a class="headerlink" href="#tree-based-speculative-decoding-support" title="Link to this heading">#</a></h3>
+<div align="center">
+<figure>
+  <img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_tree_spec_decoding.png" alt="tech_blog2_tree_spec_decoding" width="800" height="auto">
+</figure>
+</div>
+<p align="center"><sub><em>Figure 9. Comparison between the chain-based and tree-based speculative decoding</em></sub></p>
+<p>TensorRT-LLM PyTorch backend can only support chain-based speculative decoding now, both MTP Vanilla and MTP Eagle. However, the tree-based speculative decoding technique is widely used in previous advanced methods, such as Eagle2 and Eagle3, to increase the acceptance rate. MTPs in TensorRT-LLM can also be extended to support the tree-based technique. Figure 9 compares the chain-based method with the tree-based method. Both full tree and dynamic tree methods can help expand the candidate combinations, so that we can have more choices for the draft tokens.</p>
+</section>
+<section id="eagle3-support">
+<h3>Eagle3 support<a class="headerlink" href="#eagle3-support" title="Link to this heading">#</a></h3>
+<p>Another important method is Eagle3. From the <a class="reference external" href="https://arxiv.org/pdf/2503.01840">Eagle3 paper</a>, the promising results show that it can help greatly increase the acceptance rate by leveraging different levels’ hidden states to predict draft tokens. Since TensorRT-LLM already has <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/pull/3035">Eagle-3 support</a> now, in the future, we also want to train an Eagle3 head to support DeepSeek-V3/R1+Eagle3 to achieve better speedup.</p>
+</section>
+<section id="fix-known-issues">
+<h3>Fix known issues<a class="headerlink" href="#fix-known-issues" title="Link to this heading">#</a></h3>
+<p>There are still some known issues, and we will fix them soon:</p>
+<ul class="simple">
+<li><p>The MTP vanilla path has a known accuracy issue. We will fix it and refactor the MTP vanilla implementation.</p></li>
+<li><p>The MTP Eagle is non-deterministic now.</p></li>
+<li><p>There is an accuracy issue when enabling MTP and attention DP together.</p></li>
+</ul>
+</section>
+</section>
+<section id="acknowledgment">
+<h2>Acknowledgment<a class="headerlink" href="#acknowledgment" title="Link to this heading">#</a></h2>
+<p>This was a remarkable cross-team effort to support and optimize MTP in TensorRT-LLM. We would like to extend our gratitude to everyone who contributed to making this possible, as it involved a typical system/algorithm co-design approach spanning multiple technical layers—including kernel optimization, runtime enhancements, algorithmic improvements, and performance measurement &amp; analysis. And a special thanks goes to the DeepSeek team for developing the MTP method, which lays down the foundation of this blog.</p>
+</section>
+</section>
+
+
+                </article>
+              
+              
+              
+              
+              
+                <footer class="prev-next-footer d-print-none">
+                  
+<div class="prev-next-area">
+    <a class="left-prev"
+       href="blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html"
+       title="previous page">
+      <i class="fa-solid fa-angle-left"></i>
+      <div class="prev-next-info">
+        <p class="prev-next-subtitle">previous</p>
+        <p class="prev-next-title">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</p>
+      </div>
+    </a>
+</div>
+                </footer>
+              
+            </div>
+            
+            
+
+
+              
+                <dialog id="pst-secondary-sidebar-modal"></dialog>
+                <div id="pst-secondary-sidebar" class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
+
+
+  <div class="sidebar-secondary-item">
+<div
+    id="pst-page-navigation-heading-2"
+    class="page-toc tocsection onthispage">
+    <i class="fa-solid fa-list"></i> On this page
+  </div>
+  <nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2">
+    <ul class="visible nav section-nav flex-column">
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#table-of-contents">Table of Contents</a></li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#mtp-for-inference">MTP for inference</a><ul class="nav section-nav flex-column">
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#background">Background</a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#mtp-vanilla">MTP Vanilla</a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#mtp-eagle">MTP Eagle</a></li>
+</ul>
+</li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#mtp-implementation-in-tensorrt-llm">MTP implementation in TensorRT-LLM</a><ul class="nav section-nav flex-column">
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#basic-implementation">Basic Implementation</a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#mtp-modules">MTP Modules</a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#attention-for-mtp">Attention for MTP</a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#how-to-run-deepseek-models-with-mtp">How to run DeepSeek models with MTP</a></li>
+</ul>
+</li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#mtp-optimization-relaxed-acceptance">MTP optimization - Relaxed Acceptance</a><ul class="nav section-nav flex-column">
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#relaxed-acceptance">Relaxed Acceptance</a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#how-to-run-the-deepseek-r1-model-with-relaxed-acceptance">How to run the DeepSeek-R1 model with Relaxed Acceptance</a></li>
+</ul>
+</li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#evaluation">Evaluation</a><ul class="nav section-nav flex-column">
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#achieving-speedup-with-mtp-speculative-decoding">Achieving speedup with MTP speculative decoding</a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#accuracy-studies-for-relaxed-acceptance">Accuracy studies for Relaxed Acceptance</a></li>
+</ul>
+</li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#future-works">Future Works</a><ul class="nav section-nav flex-column">
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tree-based-speculative-decoding-support">Tree-based speculative decoding support</a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#eagle3-support">Eagle3 support</a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#fix-known-issues">Fix known issues</a></li>
+</ul>
+</li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#acknowledgment">Acknowledgment</a></li>
+</ul>
+  </nav></div>
+
+</div></div>
+              
+            
+
+          </div>
+          <footer class="bd-footer-content">
+            
+          </footer>
+        
+      </main>
+    </div>
+  </div>
+  
+  <!-- Scripts loaded after <body> so the DOM is not blocked -->
+  <script defer src="../../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf"></script>
+<script defer src="../../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf"></script>
+
+  <footer class="bd-footer">
+<div class="bd-footer__inner bd-page-width">
+  
+    <div class="footer-items__start">
+      
+        <div class="footer-item">
+<a class="footer-brand logo" href="https://www.nvidia.com">
+  <img src="../../_static/nvidia-logo-horiz-rgb-1c-blk-for-screen.svg" class="logo__image only-light" alt="NVIDIA"/>
+  <img src="../../_static/nvidia-logo-horiz-rgb-1c-wht-for-screen.svg" class="logo__image only-dark" alt="NVIDIA"/>
+</a></div>
+      
+        <div class="footer-item">
+
+<div class="footer-links">
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/">Privacy Policy</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/">Manage My Privacy</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/preferences/start/">Do Not Sell or Share My Data</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/">Terms of Service</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/">Accessibility</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/">Corporate Policies</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/product-security/">Product Security</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/contact/">Contact</a>
+  
+  
+  
+</div>
+</div>
+      
+        <div class="footer-item">
+
+
+
+
+  <p class="copyright">
+    
+      Copyright © 2025, NVIDIA.
+      <br/>
+    
+  </p>
+</div>
+      
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
+    </div>
+  
+  
+  
+</div>
+
+  </footer>
+  </body>
+</html>
\ No newline at end of file
diff --git a/latest/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.html b/latest/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.html
new file mode 100644
index 0000000000..116bb0c4cb
--- /dev/null
+++ b/latest/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.html
@@ -0,0 +1,904 @@
+
+
+<!DOCTYPE html>
+
+
+<html lang="en" data-content_root="../../" >
+
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
+
+    <title>Optimizing DeepSeek R1 Throughput on NVIDIA Blackwell GPUs: A Deep Dive for Developers &#8212; TensorRT-LLM</title>
+  
+  
+  
+  <script data-cfasync="false">
+    document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
+    document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
+  </script>
+  <!--
+    this gives us a CSS class that will be invisible only if JS is disabled
+  -->
+  <noscript>
+    <style>
+      .pst-js-only { display: none !important; }
+
+    </style>
+  </noscript>
+  
+  <!-- Loaded before other Sphinx assets -->
+  <link href="../../_static/styles/theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
+<link href="../../_static/styles/pydata-sphinx-theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
+
+    <link rel="stylesheet" type="text/css" href="../../_static/pygments.css?v=8f2a1f02" />
+    <link rel="stylesheet" type="text/css" href="../../_static/styles/nvidia-sphinx-theme.css?v=df3ac72c" />
+    <link rel="stylesheet" type="text/css" href="../../_static/copybutton.css?v=76b2166b" />
+    <link rel="stylesheet" type="text/css" href="../../_static/autodoc_pydantic.css" />
+  
+  <!-- So that users can add custom icons -->
+  <script src="../../_static/scripts/fontawesome.js?digest=8878045cc6db502f8baf"></script>
+  <!-- Pre-loaded scripts that we'll load fully later -->
+  <link rel="preload" as="script" href="../../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf" />
+<link rel="preload" as="script" href="../../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf" />
+
+    <script src="../../_static/documentation_options.js?v=5929fcd5"></script>
+    <script src="../../_static/doctools.js?v=9a2dae69"></script>
+    <script src="../../_static/sphinx_highlight.js?v=dc90522c"></script>
+    <script src="../../_static/clipboard.min.js?v=a7894cd8"></script>
+    <script src="../../_static/copybutton.js?v=65e89d2a"></script>
+    <script>DOCUMENTATION_OPTIONS.pagename = 'blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs';</script>
+    <script>
+        DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
+        DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
+        DOCUMENTATION_OPTIONS.show_version_warning_banner =
+            false;
+        </script>
+    <link rel="icon" href="../../_static/favicon.png"/>
+    <link rel="index" title="Index" href="../../genindex.html" />
+    <link rel="search" title="Search" href="../../search.html" />
+
+  <meta name="viewport" content="width=device-width, initial-scale=1"/>
+  <meta name="docsearch:language" content="en"/>
+  <meta name="docsearch:version" content="0.21.0rc0" />
+
+
+  </head>
+  
+  
+  <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
+
+  
+  
+  <div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
+  
+  <div id="pst-scroll-pixel-helper"></div>
+  
+  <button type="button" class="btn rounded-pill" id="pst-back-to-top">
+    <i class="fa-solid fa-arrow-up"></i>Back to top</button>
+
+  
+  <dialog id="pst-search-dialog">
+    
+<form class="bd-search d-flex align-items-center"
+      action="../../search.html"
+      method="get">
+  <i class="fa-solid fa-magnifying-glass"></i>
+  <input type="search"
+         class="form-control"
+         name="q"
+         placeholder="Search the docs ..."
+         aria-label="Search the docs ..."
+         autocomplete="off"
+         autocorrect="off"
+         autocapitalize="off"
+         spellcheck="false"/>
+  <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
+</form>
+  </dialog>
+
+  <div class="pst-async-banner-revealer d-none">
+  <aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
+</div>
+
+  
+    <header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
+<div class="bd-header__inner bd-page-width">
+  <button class="pst-navbar-icon sidebar-toggle primary-toggle" aria-label="Site navigation">
+    <span class="fa-solid fa-bars"></span>
+  </button>
+  
+  
+  <div class="col-lg-3 navbar-header-items__start">
+    
+      <div class="navbar-item">
+
+  
+    
+  
+
+<a class="navbar-brand logo" href="../../index.html">
+  
+  
+  
+  
+  
+    
+    
+      
+    
+    
+    <img src="../../_static/nvidia-logo-horiz-rgb-blk-for-screen.svg" class="logo__image only-light" alt="TensorRT-LLM - Home"/>
+    <img src="../../_static/nvidia-logo-horiz-rgb-wht-for-screen.svg" class="logo__image only-dark pst-js-only" alt="TensorRT-LLM - Home"/>
+  
+  
+    <p class="title logo__title">TensorRT-LLM</p>
+  
+</a></div>
+    
+  </div>
+  
+  <div class="col-lg-9 navbar-header-items">
+    
+    <div class="me-auto navbar-header-items__center">
+      
+        <div class="navbar-item">
+
+
+<div class="version-switcher__container dropdown pst-js-only">
+  <button id="pst-version-switcher-button-2"
+    type="button"
+    class="version-switcher__button btn btn-sm dropdown-toggle"
+    data-bs-toggle="dropdown"
+    aria-haspopup="listbox"
+    aria-controls="pst-version-switcher-list-2"
+    aria-label="Version switcher list"
+  >
+    Choose version  <!-- this text may get changed later by javascript -->
+    <span class="caret"></span>
+  </button>
+  <div id="pst-version-switcher-list-2"
+    class="version-switcher__menu dropdown-menu list-group-flush py-0"
+    role="listbox" aria-labelledby="pst-version-switcher-button-2">
+    <!-- dropdown will be populated by javascript on page load -->
+  </div>
+</div></div>
+      
+    </div>
+    
+    
+    <div class="navbar-header-items__end">
+      
+        <div class="navbar-item navbar-persistent--container">
+          
+
+<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
+ <i class="fa-solid fa-magnifying-glass"></i>
+ <span class="search-button__default-text">Search</span>
+ <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
+</button>
+        </div>
+      
+      
+        <div class="navbar-item">
+
+<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode"  data-bs-placement="bottom" data-bs-toggle="tooltip">
+  <i class="theme-switch fa-solid fa-sun                fa-lg" data-mode="light" title="Light"></i>
+  <i class="theme-switch fa-solid fa-moon               fa-lg" data-mode="dark"  title="Dark"></i>
+  <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"  title="System Settings"></i>
+</button></div>
+      
+    </div>
+    
+  </div>
+  
+  
+    <div class="navbar-persistent--mobile">
+
+<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
+ <i class="fa-solid fa-magnifying-glass"></i>
+ <span class="search-button__default-text">Search</span>
+ <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
+</button>
+    </div>
+  
+
+  
+    <button class="pst-navbar-icon sidebar-toggle secondary-toggle" aria-label="On this page">
+      <span class="fa-solid fa-outdent"></span>
+    </button>
+  
+</div>
+
+    </header>
+  
+
+  <div class="bd-container">
+    <div class="bd-container__inner bd-page-width">
+      
+      
+      
+      <dialog id="pst-primary-sidebar-modal"></dialog>
+      <div id="pst-primary-sidebar" class="bd-sidebar-primary bd-sidebar">
+        
+
+
+
+  
+    
+  
+
+<a class="navbar-brand logo" href="../../index.html">
+  
+  
+  
+  
+  
+    
+    
+      
+    
+    
+    <img src="../../_static/nvidia-logo-horiz-rgb-blk-for-screen.svg" class="logo__image only-light" alt="TensorRT-LLM - Home"/>
+    <img src="../../_static/nvidia-logo-horiz-rgb-wht-for-screen.svg" class="logo__image only-dark pst-js-only" alt="TensorRT-LLM - Home"/>
+  
+  
+    <p class="title logo__title">TensorRT-LLM</p>
+  
+</a>
+
+
+  
+  <div class="sidebar-header-items sidebar-primary__section">
+    
+    
+      <div class="sidebar-header-items__center">
+        
+          
+          
+            <div class="navbar-item">
+
+
+<div class="version-switcher__container dropdown pst-js-only">
+  <button id="pst-version-switcher-button-3"
+    type="button"
+    class="version-switcher__button btn btn-sm dropdown-toggle"
+    data-bs-toggle="dropdown"
+    aria-haspopup="listbox"
+    aria-controls="pst-version-switcher-list-3"
+    aria-label="Version switcher list"
+  >
+    Choose version  <!-- this text may get changed later by javascript -->
+    <span class="caret"></span>
+  </button>
+  <div id="pst-version-switcher-list-3"
+    class="version-switcher__menu dropdown-menu list-group-flush py-0"
+    role="listbox" aria-labelledby="pst-version-switcher-button-3">
+    <!-- dropdown will be populated by javascript on page load -->
+  </div>
+</div></div>
+          
+        
+      </div>
+    
+    
+    
+      <div class="sidebar-header-items__end">
+        
+          <div class="navbar-item">
+
+<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode"  data-bs-placement="bottom" data-bs-toggle="tooltip">
+  <i class="theme-switch fa-solid fa-sun                fa-lg" data-mode="light" title="Light"></i>
+  <i class="theme-switch fa-solid fa-moon               fa-lg" data-mode="dark"  title="Dark"></i>
+  <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"  title="System Settings"></i>
+</button></div>
+        
+      </div>
+    
+  </div>
+  
+    <div class="sidebar-primary-items__start sidebar-primary__section">
+        <div class="sidebar-primary-item">
+
+
+
+<nav class="bd-docs-nav bd-links"
+     aria-label="Table of Contents">
+  <p class="bd-links__title" role="heading" aria-level="1">Table of Contents</p>
+  <div class="bd-toc-item navbar-nav"><p aria-level="2" class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../overview.html">Overview</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../quick-start-guide.html">Quick Start Guide</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../key-features.html">Key Features</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../torch.html">PyTorch Backend</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../release-notes.html">Release Notes</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Installation</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../installation/linux.html">Installing on Linux</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../installation/build-from-source-linux.html">Building from Source Code on Linux</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../installation/grace-hopper.html">Installing on Grace Hopper</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">LLM API</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../llm-api/index.html">API Introduction</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../llm-api/reference.html">API Reference</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_serve.html">Llm Mgmn Trtllm Serve</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1"><a class="reference internal" href="../../examples/customization.html">LLM Common Customizations</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference.html">Generate text</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_customize.html">Generate text with customization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_mgmn_trtllm_serve.html">Llm Mgmn Trtllm Serve</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../examples/trtllm_serve_examples.html">Online Serving Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/curl_chat_client.html">Curl Chat Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/curl_chat_client_for_multimodal.html">Curl Chat Client For Multimodal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/curl_completion_client.html">Curl Completion Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/genai_perf_client.html">Genai Perf Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client For Multimodal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
+</ul>
+</details></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Model Definition API</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../python-api/tensorrt_llm.layers.html">Layers</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../python-api/tensorrt_llm.functional.html">Functionals</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../python-api/tensorrt_llm.models.html">Models</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../python-api/tensorrt_llm.plugin.html">Plugin</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../python-api/tensorrt_llm.quantization.html">Quantization</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../python-api/tensorrt_llm.runtime.html">Runtime</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">C++ API</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../_cpp_gen/executor.html">Executor</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../_cpp_gen/runtime.html">Runtime</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-build.html">trtllm-build</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../commands/trtllm-serve.html">trtllm-serve</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../architecture/overview.html">TensorRT-LLM Architecture</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../architecture/core-concepts.html">Model Definition</a></li>
+
+
+
+<li class="toctree-l1"><a class="reference internal" href="../../architecture/checkpoint.html">TensorRT-LLM Checkpoint</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../architecture/workflow.html">TensorRT-LLM Build Workflow</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../architecture/add-model.html">Adding a Model</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Advanced</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/gpt-runtime.html">C++ GPT Runtime</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/executor.html">Executor API</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../performance/perf-overview.html">Overview</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../performance/perf-benchmarking.html">Benchmarking</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../../performance/performance-tuning-guide/index.html">Performance Tuning Guide</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../../performance/performance-tuning-guide/benchmarking-default-performance.html">Benchmarking Default Performance</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../performance/performance-tuning-guide/useful-build-time-flags.html">Useful Build-Time Flags</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.html">Tuning Max Batch Size and Max Num Tokens</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../performance/performance-tuning-guide/deciding-model-sharding-strategy.html">Deciding Model Sharding Strategy</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../performance/performance-tuning-guide/fp8-quantization.html">FP8 Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../performance/performance-tuning-guide/useful-runtime-flags.html">Useful Runtime Options</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1"><a class="reference internal" href="../../performance/perf-analysis.html">Performance Analysis</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Reference</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../../reference/troubleshooting.html">Troubleshooting</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../reference/support-matrix.html">Support Matrix</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../reference/precision.html">Numerical Precision</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../reference/memory.html">Memory Usage of TensorRT-LLM</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Blogs</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../H100vsA100.html">H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../H200launch.html">H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
+<li class="toctree-l1"><a class="reference internal" href="blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
+</ul>
+</div>
+</nav></div>
+    </div>
+  
+  
+  <div class="sidebar-primary-items__end sidebar-primary__section">
+  </div>
+
+
+
+      </div>
+      
+      <main id="main-content" class="bd-main" role="main">
+        
+        
+          <div class="bd-content">
+            <div class="bd-article-container">
+              
+              <div class="bd-header-article d-print-none">
+<div class="header-article-items header-article__inner">
+  
+    <div class="header-article-items__start">
+      
+        <div class="header-article-item">
+
+<nav aria-label="Breadcrumb" class="d-print-none">
+  <ul class="bd-breadcrumbs">
+    
+    <li class="breadcrumb-item breadcrumb-home">
+      <a href="../../index.html" class="nav-link" aria-label="Home">
+        <i class="fa-solid fa-home"></i>
+      </a>
+    </li>
+    <li class="breadcrumb-item active" aria-current="page"><span class="ellipsis">Optimizing DeepSeek R1 Throughput on NVIDIA Blackwell GPUs: A Deep Dive for Developers</span></li>
+  </ul>
+</nav>
+</div>
+      
+    </div>
+  
+  
+</div>
+</div>
+              
+              
+              
+                
+<div id="searchbox"></div>
+                <article class="bd-article">
+                  
+  <section id="optimizing-deepseek-r1-throughput-on-nvidia-blackwell-gpus-a-deep-dive-for-developers">
+<h1>Optimizing DeepSeek R1 Throughput on NVIDIA Blackwell GPUs: A Deep Dive for Developers<a class="headerlink" href="#optimizing-deepseek-r1-throughput-on-nvidia-blackwell-gpus-a-deep-dive-for-developers" title="Link to this heading">#</a></h1>
+<p>By NVIDIA TensorRT-LLM team</p>
+<section id="table-of-contents">
+<h2>Table of Contents<a class="headerlink" href="#table-of-contents" title="Link to this heading">#</a></h2>
+<ul class="simple">
+<li><p><a class="reference internal" href="#introduction">Introduction</a></p></li>
+<li><p><a class="reference internal" href="#precision-strategy">Precision strategy</a></p></li>
+<li><p><a class="reference internal" href="#parallel-strategy">Parallel strategy</a></p>
+<ul>
+<li><p><a class="reference internal" href="#weights-absorb-and-mqa">Weights absorb and MQA</a></p></li>
+<li><p><a class="reference internal" href="#data-parallel-for-attention-module-adp">Data Parallel for Attention module (ADP)</a></p></li>
+<li><p><a class="reference internal" href="#expert-parallel-for-moe-ep">Expert parallel for MoE (EP)</a></p></li>
+</ul>
+</li>
+<li><p><a class="reference internal" href="#mla-layers-optimizations">MLA Layers Optimizations</a></p></li>
+<li><p><a class="reference internal" href="#moe-layers-optimizations">MoE Layers Optimizations</a></p></li>
+<li><p><a class="reference internal" href="#runtime-optimizations">Runtime Optimizations</a></p></li>
+<li><p><a class="reference internal" href="#how-to-reproduce">How to reproduce</a></p></li>
+<li><p><a class="reference internal" href="#future-works">Future Works</a></p></li>
+<li><p><a class="reference internal" href="#acknowledgment">Acknowledgment</a></p></li>
+</ul>
+</section>
+<section id="introduction">
+<h2>Introduction<a class="headerlink" href="#introduction" title="Link to this heading">#</a></h2>
+<p>The open source DeepSeek R1 model’s innovative architecture including the multi-head latent attention (MLA) and large sparse Mixture-of-Experts (MoE) significantly improved the inference efficiency of the LLM models. However, harnessing the full potential of such an innovative structure requires equally important hardware/software co-optimization. This post delves into the optimization strategies for DeepSeek R1 throughput oriented scenarios (TPS/GPU), developed by NVIDIA within TensorRT-LLM on NVIDIA’s Blackwell B200 GPUs. We will explore the rationale behind each enhancement. <a class="reference internal" href="blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html"><span class="std std-doc">The other min-latency optimization blog</span></a> explained in detail how TensorRT-LLM optimizes the R1 performance to achieve the best of the TPS/USER.</p>
+<p>These optimizations have significantly boosted DeepSeek R1 throughput on Blackwell. Performance increased from approximately 2000 TPS/GPU in February to 4600 TPS/GPU on ISL/OSL 1K/2K dataset. The optimizations are general and applicable to other ISL/OSL configs too. These optimization items were broadly categorized into three areas: MLA layers, MoE layers, and runtime.</p>
+</section>
+<section id="precision-strategy">
+<h2>Precision strategy<a class="headerlink" href="#precision-strategy" title="Link to this heading">#</a></h2>
+<p>The mixed precision recipe for DeepSeek R1 throughput scenario is almost the same as <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md#precision-strategy">what</a> is used for latency oriented scenario, with the following differences:</p>
+<ul class="simple">
+<li><p>FP8 KV cache and FP8 attention, rather than BF16 precision.</p></li>
+<li><p>FP4 Allgather for better communication bandwidth utilization.</p></li>
+</ul>
+<p>The checkpoint used in this blog is hosted at <a class="reference external" href="https://huggingface.co/nvidia/DeepSeek-R1-FP4">nvidia/DeepSeek-R1-FP4</a> and was generated by <a class="reference external" href="https://github.com/NVIDIA/TensorRT-Model-Optimizer">NVIDIA Model Optimizer</a>. The accuracy scores on common datasets for this FP4 checkpoint and the TensorRT-LLM implementations are:</p>
+<div class="pst-scrollable-table-container"><table class="table">
+<thead>
+<tr class="row-odd"><th class="head text-left"><p>Precision</p></th>
+<th class="head text-left"><p>GPQA Diamond</p></th>
+<th class="head text-left"><p>MATH-500</p></th>
+</tr>
+</thead>
+<tbody>
+<tr class="row-even"><td class="text-left"><p>TensorRT-LLM FP8</p></td>
+<td class="text-left"><p>0.697</p></td>
+<td class="text-left"><p>0.954</p></td>
+</tr>
+<tr class="row-odd"><td class="text-left"><p>TensorRT-LLM FP4</p></td>
+<td class="text-left"><p>0.705</p></td>
+<td class="text-left"><p>0.96</p></td>
+</tr>
+</tbody>
+</table>
+</div>
+<p>** Note that there is some run-to-run variance in these evaluations, so the FP4 data is slightly higher here. We think FP4 has comparable accuracy with FP8 on these datasets.</p>
+<p>The MoE layers inside this checkpoint have been quantized into FP4. Quantizing the MoE layer weights into FP4 has the following benefits:</p>
+<ul class="simple">
+<li><p>Fully utilize the 5th generation Tensor Core FLOPS of the NVIDIA Blackwell GPUs</p></li>
+<li><p>Reduce the memory load needs of the weights by almost half for MoE. This matters because the MoE parts are still memory bound in the decoding phase for this scenario, and 97% of the weights in the DeepSeek R1 model are from MoE layers.</p></li>
+<li><p>Reduce the memory footprint of the model weights, thus freeing more GPU memory for KV cache and thereby increasing the max concurrency. <a class="reference external" href="https://huggingface.co/deepseek-ai/DeepSeek-R1">The original FP8 model checkpoint of the DeepSeek R1 model</a> is about 640 GB, while the NVIDIA-provided <a class="reference external" href="https://huggingface.co/nvidia/DeepSeek-R1-FP4">DeepSeek R1 FP4 quantized model</a> is only about 400 GB.</p></li>
+</ul>
+<p>The accuracy of the FP8 KV cache and FP8 attention kernels was evaluated on the GSM8K dataset, with no obvious accuracy drop. For the accuracy numbers, please see the table in the FP8 KV cache section. Users can still opt out and use BF16 KV cache and attention if they observe accuracy differences on their own dataset.</p>
+</section>
+<section id="parallel-strategy">
+<h2>Parallel strategy<a class="headerlink" href="#parallel-strategy" title="Link to this heading">#</a></h2>
+<p>The parallelism strategy for DeepSeek R1 throughput scenario is different from what is used for latency-oriented scenarios.</p>
+<div class="pst-scrollable-table-container"><table class="table">
+<thead>
+<tr class="row-odd"><th class="head text-left"><p>Components</p></th>
+<th class="head text-left"><p>Parallel Patterns</p></th>
+</tr>
+</thead>
+<tbody>
+<tr class="row-even"><td class="text-left"><p>Attention Modules</p></td>
+<td class="text-left"><p>Data Parallelism 8 (DP8)</p></td>
+</tr>
+<tr class="row-odd"><td class="text-left"><p>MoE Sparse Experts</p></td>
+<td class="text-left"><p>Expert Parallelism 8 (EP8)</p></td>
+</tr>
+<tr class="row-even"><td class="text-left"><p>MoE Shared Experts</p></td>
+<td class="text-left"><p>DP8</p></td>
+</tr>
+<tr class="row-odd"><td class="text-left"><p>Fuse_A GEMM</p></td>
+<td class="text-left"><p>DP8</p></td>
+</tr>
+<tr class="row-even"><td class="text-left"><p>Router GEMM</p></td>
+<td class="text-left"><p>DP8</p></td>
+</tr>
+</tbody>
+</table>
+</div>
+<p>In the following sections we explain why DP and EP are chosen instead of tensor parallelism (TP).</p>
+<section id="weights-absorb-and-mqa">
+<h3>Weights absorb and MQA<a class="headerlink" href="#weights-absorb-and-mqa" title="Link to this heading">#</a></h3>
+<p>The core idea of MLA is the low-rank joint compression of the attention keys and values to reduce the KV-cache size during inference. Based on the MLA formulas, the down-projected KV latent is up-projected to multiple heads and combined with the up-projected Q to establish a normal multi-head attention (MHA). Because matrix multiplication is associative, the up-projection weights matrix of K (W^UK) can first be multiplied by the up-projection weights matrix of Q (W^Q), and the resulting matrix can then be applied to Q. The up-projection weights matrix of V (W^UV) and the attention output projection matrix W^O can likewise be multiplied together and applied after the attention output. The DeepSeek-V2 technical report calls this technique “absorb”. After the weights are absorbed, the MLA is equivalent to multi-query attention (MQA). Please see the <a class="reference external" href="https://arxiv.org/pdf/2405.04434">original DeepSeek-V2 technical paper</a> for the detailed formulas and explanations. The following block diagram shows the computational flow of weights-absorbed MLA in TensorRT-LLM.
+<img alt="Weights Absorb" src="../../_images/tech_blog3_mla_absorb.png" /></p>
+<p>For the decoding phase, the weights absorb significantly reduces the math FLOPS needed to up project the K and V, since the FLOPs needed for these up projections of KV are linear to the KV cache length, while length of Q vector is always 1 in the decoding phase. The longer the KV cache history is, the more FLOPs are needed, and the up projections are repeated for every decoded token since only the projected KV latent were saved, which further increases the FLOPs needed.
+For the prefill phase, the weights-absorbed version changes the dimensions of Q and KV, thus increasing the number of FLOPs for attention. Based on roofline analysis, the non-absorbed version is beneficial for the prefill phase with input length 256 or larger.
+The TensorRT-LLM MLA implementation chooses different highly optimized kernels for prefill and decoding, see <a class="reference download internal" download="" href="../../_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py"><span class="xref download myst">MLA</span></a>.</p>
+</section>
+<section id="data-parallel-for-attention-module-adp">
+<h3>Data Parallel for Attention module (ADP)<a class="headerlink" href="#data-parallel-for-attention-module-adp" title="Link to this heading">#</a></h3>
+<p>The intuition behind choosing attention DP is that doing TP for the MQA (where different GPUs compute different attention Q heads) will duplicate the KV cache memory, which limits the concurrency the system can achieve. The duplication factor is equal to the TP group size, thus 8x for TP8. Small concurrency hurts the throughput of a powerful system like NVIDIA DGX B200.</p>
+<p>For the DeepSeek R1 FP4 checkpoint with 8 B200 GPUs, the weights and activations occupy about 80 GB of memory on each GPU, and the free memory available for KV cache per GPU will be 100 GB. Assuming ISL 1K and OSL 2K, each request will consume about 200 MB of KV cache, which results in a per-GPU max concurrency of 500. A single-node 8xGPU system therefore has a global concurrency of 4000. When using attention TP, the global concurrency will become just 500.</p>
+<p>Silicon experiments show the attention DP technique provides a significant <strong>400% speedup</strong> in the max throughput cases, when keeping all other factors the same.</p>
+</section>
+<section id="expert-parallel-for-moe-ep">
+<h3>Expert parallel for MoE (EP)<a class="headerlink" href="#expert-parallel-for-moe-ep" title="Link to this heading">#</a></h3>
+<p>The DeepSeek R1 MoE design features 256 small sparse experts and 1 shared expert. The GEMM problem sizes of these experts are as follows.</p>
+<div class="pst-scrollable-table-container"><table class="table">
+<thead>
+<tr class="row-odd"><th class="head text-left"><p>GEMM</p></th>
+<th class="head text-left"><p>group</p></th>
+<th class="head text-left"><p>GEMM N</p></th>
+<th class="head text-left"><p>GEMM K</p></th>
+</tr>
+</thead>
+<tbody>
+<tr class="row-even"><td class="text-left"><p>shared_fc1</p></td>
+<td class="text-left"><p>1</p></td>
+<td class="text-left"><p>4096</p></td>
+<td class="text-left"><p>7168</p></td>
+</tr>
+<tr class="row-odd"><td class="text-left"><p>shared_fc2</p></td>
+<td class="text-left"><p>1</p></td>
+<td class="text-left"><p>7168</p></td>
+<td class="text-left"><p>2048</p></td>
+</tr>
+<tr class="row-even"><td class="text-left"><p>sparse_fc1</p></td>
+<td class="text-left"><p>256</p></td>
+<td class="text-left"><p>4096</p></td>
+<td class="text-left"><p>7168</p></td>
+</tr>
+<tr class="row-odd"><td class="text-left"><p>sparse_fc2</p></td>
+<td class="text-left"><p>256</p></td>
+<td class="text-left"><p>7168</p></td>
+<td class="text-left"><p>2048</p></td>
+</tr>
+</tbody>
+</table>
+</div>
+<p>These experts can be parallelized with either Tensor-Parallelism or Expert-Parallelism. Our current ablation study reveals that Expert-Parallelism achieves better GEMM FLOPS because it has better GEMM problem sizes. Expert-Parallelism can also save GPU communication bandwidth compared to AllReduce, because each token only needs to be sent to the GPUs where its active experts are located, while TP needs an AllReduce for all the tokens between all the GPUs. Also note that, to scale DeepSeek R1 inference to systems like GB200 NVL72 while fully utilizing the aggregated memory bandwidth and Tensor Core FLOPS, large EP is needed. We are actively working on implementing it.</p>
+<p>Silicon performance measurements show that Expert-Parallelism can provide 142% speedup for 1K/2K max throughput case, when keeping other factors the same.</p>
+</section>
+</section>
+<section id="mla-layers-optimizations">
+<h2>MLA Layers Optimizations<a class="headerlink" href="#mla-layers-optimizations" title="Link to this heading">#</a></h2>
+<p>Other than the parallel strategy and precision strategy we explained above, we have done the following optimizations for layers/kernels inside the MLA module.</p>
+<ul>
+<li><p>Attention Kernels Optimization</p>
+<p>This provided a <strong>20% E2E speedup</strong> compared to the February baseline implementation. It involved implementing <strong>high-throughput generation MLA kernels</strong>. Techniques include using the 2CTA Group variant of the 5th generation Tensor Core MMA instructions of Blackwell GPUs, overlapping MLA with softmax using interleaved tiles, and fine-tuning kernel selection heuristics for the DeepSeek R1 problem size.</p>
+</li>
+<li><p>FP8 KV Cache</p>
+<p>An important optimization that yielded a <strong>6% E2E throughput increase</strong> when assuming the concurrency was identical. Another benefit of FP8 KV cache is <strong>compressing the KV cache size by half</strong>, which <strong>allows for larger concurrency</strong>. It also enables the use of faster FP8 attention kernels compared to BF16. We recommend that users always turn on FP8 KV cache to get better performance. In the context phase, KV is quantized to FP8 and saved to the KV cache pool. In the generation phase, both Q and KV are quantized to FP8, and FP8 Multi-Query Attention (MQA) is used. Evaluation on GSM8k showed <strong>no meaningful accuracy drop</strong>. The quantization typically uses static per-tensor FP8 with a scaling factor defaulting to 1.0, but KV cache scaling factor can also be generated by calibrating on a target dataset. Below are the accuracy metrics of different combinations on the GSM8K dataset.</p>
+<div class="pst-scrollable-table-container"><table class="table">
+<thead>
+<tr class="row-odd"><th class="head text-left"><p>KV Cache Type</p></th>
+<th class="head text-left"><p>FP8 Checkpoint</p></th>
+<th class="head text-left"><p>FP4 Checkpoint</p></th>
+</tr>
+</thead>
+<tbody>
+<tr class="row-even"><td class="text-left"><p>BF16 MLA and KV cache</p></td>
+<td class="text-left"><p>0.9629</p></td>
+<td class="text-left"><p>0.9606</p></td>
+</tr>
+<tr class="row-odd"><td class="text-left"><p>FP8 MLA and KV cache</p></td>
+<td class="text-left"><p>0.9613</p></td>
+<td class="text-left"><p>0.9606</p></td>
+</tr>
+</tbody>
+</table>
+</div>
+</li>
+<li><p>Manual GEMM tactics tuning</p>
+<p>This optimization addresses cases where the default heuristic algorithm in cuBLAS is not performing best for specific GEMM shapes existing in the model. We built an internal tool to find the best algorithm for these specific shapes offline and then used the <code class="docutils literal notranslate"><span class="pre">cublasLtMatmul</span></code> API to apply this specific, optimized algorithm at runtime. This is a necessary system optimization when general-purpose heuristics don’t find the most efficient kernel for all specific cases. We are also working actively with the cuBLAS team to further enhance the heuristics such that the best performance can always be achieved OOTB. See <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/thop/cublasScaledMM.cpp#L54">cublasScaledMM.cpp</a> for the tuning details.</p>
+</li>
+<li><p>Horizontal Fusions</p>
+<p>This involves fusing GEMM operations of down projection of Q/KV and rope dimensions of K tensor. See <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/models/modeling_deepseekv3.py#L1305">modeling_deepseekv3.py</a> for details. Horizontal fusion reduces the kernel launch overhead and increases the GEMM problem sizes which can achieve better HW utilization. It is a common technique shared by both min-latency and throughput optimizations.</p>
+</li>
+<li><p>2-stream optimizations</p>
+<p>There are some small operations which can be run in parallel like the Q norm and KV norm inside the MLA. These operations cannot fully utilize the GPU math flops and the memory bandwidth, thus running in parallel CUDA streams can bring speed-up.</p>
+</li>
+</ul>
+</section>
+<section id="moe-layers-optimizations">
+<h2>MoE Layers Optimizations<a class="headerlink" href="#moe-layers-optimizations" title="Link to this heading">#</a></h2>
+<p>The following optimizations are already done for MoE layers.</p>
+<ul>
+<li><p>Mix I/O data type for the router GEMM</p>
+<p>Achieved a <strong>4% E2E speedup</strong> by avoiding casting operations and performing the GEMM using a mixture of input and output data types (e.g., BF16 input and FP32 output) directly. This eliminates the need to explicitly cast inputs to the output type and saves memory bandwidth.</p>
+</li>
+<li><p>Top-K Kernels Fusions</p>
+<p>Resulted in a <strong>7.4% E2E speedup</strong>. For DeepSeek R1, selecting the top 8 experts from 256 is done in a two-phase approach: first selecting top groups, then finding the top 8 within those groups. DeepSeek R1 uses some additional techniques for better expert load balance, which involve adding bias and scales to the top-K computations. All these operations result in 18 PyTorch ops when not fused; see <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/models/modeling_deepseekv3.py#L213">Deepseekv3RoutingImpl</a>. Fusing the multiple kernels involved in these Top-K calculations significantly reduces the overall computation time. Compared to using 18 native PyTorch ops, fusion can reduce the operation to as few as 2 kernels. Based on the measurement on B200, fusing these kernels can reduce the kernel time from 252us to 15us in the target setting.</p>
+</li>
+<li><p>FP4 AllGather Optimizations</p>
+<p>Showed a <strong>4% E2E speedup</strong>. This optimization replaces the BF16 AllGather operation with an FP4 version. Using a lower precision for this communication primitive reduces the amount of data transferred over the network, significantly improving communication efficiency. Also, since the original BF16 Tensor to be transferred will get cast into FP4 format after the AllGather communication anyway, this optimization has no impact on accuracy. At the kernel level, we are seeing about a 3x speedup when switching from BF16 to FP4 AllGather.</p>
+</li>
+<li><p>CUTLASS Group GEMM optimizations</p>
+<p>Provided a <strong>1.3% E2E speedup</strong>. There are some CUTLASS level optimizations shared by both min-latency and throughput cases. Just updating CUTLASS to the latest version gives us 13% kernel improvement for the MoE groupGemm, and resulted in +1.3% E2E TPS/GPU.</p>
+</li>
+<li><p>Multi-stream optimizations</p>
+<p>Running the shared and routed experts in 2 streams, combined with other multi-streaming optimizations in the MLA modules, contributed a <strong>5.3% E2E speedup</strong>.</p>
+</li>
+</ul>
+</section>
+<section id="runtime-optimizations">
+<h2>Runtime Optimizations<a class="headerlink" href="#runtime-optimizations" title="Link to this heading">#</a></h2>
+<p>These optimizations target the overall execution flow, scheduling, and resource management within the inference system. They are shared between DeepSeek R1 and other models supported in TensorRT-LLM; here we share some ablation results showing their performance benefits for DeepSeek R1 on B200.</p>
+<ul>
+<li><p>CUDA Graph</p>
+<p>This had a significant <strong>22% E2E performance impact</strong> for throughput scenarios. CUDA Graphs allow capturing a sequence of CUDA operations and launching them as a single unit, drastically reducing kernel launch overheads. This is particularly beneficial for models with many small kernels, and especially in the PyTorch flow, because the Python host code normally executes slower than C++. Since a CUDA Graph freezes the kernel launch parameters, which are normally associated with the tensor shapes, it can only be safely used with static shapes, meaning that different CUDA graphs need to be captured for different batch sizes. Each graph has some cost in memory usage and capture time, so we cannot capture every possible CUDA graph for all possible batches. For the non-captured batch sizes, PyTorch eager mode code will be executed. There is a feature called CUDA Graph padding in TensorRT-LLM, which is a good trade-off between the number of CUDA Graphs and the CUDA Graph hit ratio; it tries to pad a batch to the nearest one with a captured CUDA Graph. Normally you should enable the CUDA Graph padding feature to increase the CUDA Graph hit rate, but the padding itself has some overhead due to wasted token computation. Users can opt out of the CUDA Graph padding feature to see the perf benefits by setting <code class="docutils literal notranslate"><span class="pre">cuda_graph_padding_enabled</span></code> to false; see the API in the <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/config.py#L41">PyTorch backend config</a>.</p>
+</li>
+<li><p>Overlap Scheduler:</p>
+<p>Showed a <strong>4% E2E performance impact</strong> and should generally <strong>always be used</strong>. This scheduler manages the execution of different operations (like computation and communication) to overlap them effectively on the GPU and network. The intuition is to hide latency by performing computation while waiting for data transfers or vice versa, improving overall hardware utilization. The overlap scheduler is already enabled by default in TensorRT-LLM as of this <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428#diff-3c4f29d6594b37af0f1fbb97f5291b18e49f3f2510f9d296c7adb2829e9da0bf">commit</a>. In case there are corner cases where it does not work, users can still opt out of this feature by setting <em>disable_overlap_scheduler</em> to true.</p>
+</li>
+<li><p>Memory Optimizations</p>
+<p>Resulted in a <strong>4GB improvement</strong>. This includes techniques like chunked MoE (specifically for Hopper) and fixing a cuda context init bug. These methods reduce the memory footprint of the model weights or intermediate tensors, allowing for larger batch sizes or sequence lengths, and preventing Out-of-Memory (OOM) errors.</p>
+</li>
+</ul>
+</section>
+<section id="how-to-reproduce">
+<h2>How to reproduce<a class="headerlink" href="#how-to-reproduce" title="Link to this heading">#</a></h2>
+<p>See <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md#b200-max-throughput">Perf practices</a></p>
+</section>
+<section id="future-works">
+<h2>Future Works<a class="headerlink" href="#future-works" title="Link to this heading">#</a></h2>
+<ul class="simple">
+<li><p>Large EP</p></li>
+<li><p>Chunked context</p></li>
+<li><p>More communication overlap</p></li>
+</ul>
+</section>
+<section id="acknowledgment">
+<h2>Acknowledgment<a class="headerlink" href="#acknowledgment" title="Link to this heading">#</a></h2>
+<p>The substantial throughput advancements for DeepSeek R1 on Blackwell GPUs, as detailed in this post, are the fruit of a dedicated and collaborative engineering effort. Achieving nearly a 2.3x increase in TPS/GPU required a deep dive into MLA layers, MoE layers, and runtime optimizations. We extend our sincere appreciation to all the engineers involved in this intensive optimization process. Their collective expertise in pushing the boundaries of throughput performance within TensorRT-LLM has been instrumental. We trust that sharing these specific strategies for maximizing throughput will prove beneficial to the developer community as they tackle demanding LLM inference workloads on NVIDIA hardware.</p>
+</section>
+</section>
+
+
+                </article>
+              
+              
+              
+              
+              
+                <footer class="prev-next-footer d-print-none">
+                  
+<div class="prev-next-area">
+</div>
+                </footer>
+              
+            </div>
+            
+            
+
+
+              
+                <dialog id="pst-secondary-sidebar-modal"></dialog>
+                <div id="pst-secondary-sidebar" class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
+
+
+  <div class="sidebar-secondary-item">
+<div
+    id="pst-page-navigation-heading-2"
+    class="page-toc tocsection onthispage">
+    <i class="fa-solid fa-list"></i> On this page
+  </div>
+  <nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2">
+    <ul class="visible nav section-nav flex-column">
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#table-of-contents">Table of Contents</a></li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#introduction">Introduction</a></li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#precision-strategy">Precision strategy</a></li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#parallel-strategy">Parallel strategy</a><ul class="nav section-nav flex-column">
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#weights-absorb-and-mqa">Weights absorb and MQA</a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#data-parallel-for-attention-module-adp">Data Parallel for Attention module (ADP)</a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#expert-parallel-for-moe-ep">Expert parallel for MoE (EP)</a></li>
+</ul>
+</li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#mla-layers-optimizations">MLA Layers Optimizations</a></li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#moe-layers-optimizations">MoE Layers Optimizations</a></li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#runtime-optimizations">Runtime Optimizations</a></li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#how-to-reproduce">How to reproduce</a></li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#future-works">Future Works</a></li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#acknowledgment">Acknowledgment</a></li>
+</ul>
+  </nav></div>
+
+</div></div>
+              
+            
+
+          </div>
+          <footer class="bd-footer-content">
+            
+          </footer>
+        
+      </main>
+    </div>
+  </div>
+  
+  <!-- Scripts loaded after <body> so the DOM is not blocked -->
+  <script defer src="../../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf"></script>
+<script defer src="../../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf"></script>
+
+  <footer class="bd-footer">
+<div class="bd-footer__inner bd-page-width">
+  
+    <div class="footer-items__start">
+      
+        <div class="footer-item">
+<a class="footer-brand logo" href="https://www.nvidia.com">
+  <img src="../../_static/nvidia-logo-horiz-rgb-1c-blk-for-screen.svg" class="logo__image only-light" alt="NVIDIA"/>
+  <img src="../../_static/nvidia-logo-horiz-rgb-1c-wht-for-screen.svg" class="logo__image only-dark" alt="NVIDIA"/>
+</a></div>
+      
+        <div class="footer-item">
+
+<div class="footer-links">
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/">Privacy Policy</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/">Manage My Privacy</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/preferences/start/">Do Not Sell or Share My Data</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/">Terms of Service</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/">Accessibility</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/">Corporate Policies</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/product-security/">Product Security</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/contact/">Contact</a>
+  
+  
+  
+</div>
+</div>
+      
+        <div class="footer-item">
+
+
+
+
+  <p class="copyright">
+    
+      Copyright © 2025, NVIDIA.
+      <br/>
+    
+  </p>
+</div>
+      
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
+    </div>
+  
+  
+  
+</div>
+
+  </footer>
+  </body>
+</html>
\ No newline at end of file
diff --git a/latest/commands/trtllm-build.html b/latest/commands/trtllm-build.html
index f49eeb6e73..7748f9223a 100644
--- a/latest/commands/trtllm-build.html
+++ b/latest/commands/trtllm-build.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -529,7 +533,7 @@
                     <span class="p">[</span><span class="o">--</span><span class="n">profiling_verbosity</span> <span class="p">{</span><span class="n">layer_names_only</span><span class="p">,</span><span class="n">detailed</span><span class="p">,</span><span class="n">none</span><span class="p">}]</span>
                     <span class="p">[</span><span class="o">--</span><span class="n">strip_plan</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">weight_sparsity</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">weight_streaming</span><span class="p">]</span>
                     <span class="p">[</span><span class="o">--</span><span class="n">fast_build</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">workers</span> <span class="n">WORKERS</span><span class="p">]</span>
-                    <span class="p">[</span><span class="o">--</span><span class="n">log_level</span> <span class="p">{</span><span class="n">internal_error</span><span class="p">,</span><span class="n">error</span><span class="p">,</span><span class="n">warning</span><span class="p">,</span><span class="n">info</span><span class="p">,</span><span class="n">verbose</span><span class="p">,</span><span class="n">debug</span><span class="p">}]</span>
+                    <span class="p">[</span><span class="o">--</span><span class="n">log_level</span> <span class="p">{</span><span class="n">internal_error</span><span class="p">,</span><span class="n">error</span><span class="p">,</span><span class="n">warning</span><span class="p">,</span><span class="n">info</span><span class="p">,</span><span class="n">verbose</span><span class="p">,</span><span class="n">debug</span><span class="p">,</span><span class="n">trace</span><span class="p">}]</span>
                     <span class="p">[</span><span class="o">--</span><span class="n">enable_debug_output</span><span class="p">]</span>
                     <span class="p">[</span><span class="o">--</span><span class="n">visualize_network</span> <span class="n">VISUALIZE_NETWORK</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">dry_run</span><span class="p">]</span>
                     <span class="p">[</span><span class="o">--</span><span class="n">monitor_memory</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">logits_dtype</span> <span class="p">{</span><span class="n">float16</span><span class="p">,</span><span class="n">float32</span><span class="p">}]</span>
@@ -665,7 +669,7 @@
 <p>Default: <code class="docutils literal notranslate"><span class="pre">1</span></code></p>
 </dd>
 <dt><kbd>--log_level</kbd></dt>
-<dd><p>Possible choices: internal_error, error, warning, info, verbose, debug</p>
+<dd><p>Possible choices: internal_error, error, warning, info, verbose, debug, trace</p>
 <p>The logging level.</p>
 <p>Default: <code class="docutils literal notranslate"><span class="pre">'info'</span></code></p>
 </dd>
@@ -1049,6 +1053,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/commands/trtllm-serve.html b/latest/commands/trtllm-serve.html
index 9e7730ba21..681eef74b6 100644
--- a/latest/commands/trtllm-serve.html
+++ b/latest/commands/trtllm-serve.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -778,7 +782,7 @@ However, for the PyTorch backend, specified with the <code class="docutils liter
 <dd><p>The logging level.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Options<span class="colon">:</span></dt>
-<dd class="field-odd"><p>internal_error | error | warning | info | verbose | debug</p>
+<dd class="field-odd"><p>internal_error | error | warning | info | verbose | debug | trace</p>
 </dd>
 </dl>
 </dd></dl>
@@ -827,7 +831,7 @@ However, for the PyTorch backend, specified with the <code class="docutils liter
 <dd><p>The logging level.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Options<span class="colon">:</span></dt>
-<dd class="field-odd"><p>internal_error | error | warning | info | verbose | debug</p>
+<dd class="field-odd"><p>internal_error | error | warning | info | verbose | debug | trace</p>
 </dd>
 </dl>
 </dd></dl>
@@ -1094,6 +1098,15 @@ However, for the PyTorch backend, specified with the <code class="docutils liter
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/dev-on-cloud/build-image-to-dockerhub.html b/latest/dev-on-cloud/build-image-to-dockerhub.html
index e72ebdafa9..1525aa566f 100644
--- a/latest/dev-on-cloud/build-image-to-dockerhub.html
+++ b/latest/dev-on-cloud/build-image-to-dockerhub.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -61,7 +61,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -334,6 +334,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -355,6 +356,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -419,6 +421,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -453,6 +456,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -678,6 +682,15 @@ docker<span class="w"> </span>push<span class="w"> </span>&lt;your_dockerhub_use
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/dev-on-cloud/dev-on-runpod.html b/latest/dev-on-cloud/dev-on-runpod.html
index d181b43aef..a35869e2b4 100644
--- a/latest/dev-on-cloud/dev-on-runpod.html
+++ b/latest/dev-on-cloud/dev-on-runpod.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -61,7 +61,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -334,6 +334,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -355,6 +356,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -419,6 +421,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -453,6 +456,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -678,6 +682,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/curl_chat_client.html b/latest/examples/curl_chat_client.html
index b5c5eafff8..e426b4d589 100644
--- a/latest/examples/curl_chat_client.html
+++ b/latest/examples/curl_chat_client.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -649,6 +653,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/curl_chat_client_for_multimodal.html b/latest/examples/curl_chat_client_for_multimodal.html
index d088b1ccae..301ce4394e 100644
--- a/latest/examples/curl_chat_client_for_multimodal.html
+++ b/latest/examples/curl_chat_client_for_multimodal.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -726,6 +730,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/curl_completion_client.html b/latest/examples/curl_completion_client.html
index 8b84daedd3..3025854bc1 100644
--- a/latest/examples/curl_completion_client.html
+++ b/latest/examples/curl_completion_client.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -648,6 +652,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/customization.html b/latest/examples/customization.html
index 454eb74358..669753a663 100644
--- a/latest/examples/customization.html
+++ b/latest/examples/customization.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -785,6 +789,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/deepseek_r1_reasoning_parser.html b/latest/examples/deepseek_r1_reasoning_parser.html
index eee42aeeb5..e02ebada65 100644
--- a/latest/examples/deepseek_r1_reasoning_parser.html
+++ b/latest/examples/deepseek_r1_reasoning_parser.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -649,6 +653,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/genai_perf_client.html b/latest/examples/genai_perf_client.html
index 22a70a0cbb..fde5233ecc 100644
--- a/latest/examples/genai_perf_client.html
+++ b/latest/examples/genai_perf_client.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -654,6 +658,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/genai_perf_client_for_multimodal.html b/latest/examples/genai_perf_client_for_multimodal.html
index 89d6abe45b..d6aac1f037 100644
--- a/latest/examples/genai_perf_client_for_multimodal.html
+++ b/latest/examples/genai_perf_client_for_multimodal.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -657,6 +661,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/index.html b/latest/examples/index.html
index 570ccd9ac7..f4b5146a1e 100644
--- a/latest/examples/index.html
+++ b/latest/examples/index.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -544,6 +548,7 @@
 <li class="toctree-l1"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l1"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l1"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l1"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l1"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l1"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -689,6 +694,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/llm_api_examples.html b/latest/examples/llm_api_examples.html
index c35077212c..9e2224f70c 100644
--- a/latest/examples/llm_api_examples.html
+++ b/latest/examples/llm_api_examples.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -513,6 +517,7 @@
 <li class="toctree-l1"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l1"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l1"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l1"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l1"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l1"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -653,6 +658,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/llm_auto_parallel.html b/latest/examples/llm_auto_parallel.html
index 422d3f0b52..d23b5bcbbd 100644
--- a/latest/examples/llm_auto_parallel.html
+++ b/latest/examples/llm_auto_parallel.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -672,6 +676,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/llm_eagle2_decoding.html b/latest/examples/llm_eagle2_decoding.html
new file mode 100644
index 0000000000..d00948ce13
--- /dev/null
+++ b/latest/examples/llm_eagle2_decoding.html
@@ -0,0 +1,717 @@
+
+
+<!DOCTYPE html>
+
+
+<html lang="en" data-content_root="../" >
+
+  <head>
+    <meta charset="utf-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
+
+    <title>Generate Text Using Eagle2 Decoding &#8212; TensorRT-LLM</title>
+  
+  
+  
+  <script data-cfasync="false">
+    document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
+    document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
+  </script>
+  <!--
+    this give us a css class that will be invisible only if js is disabled
+  -->
+  <noscript>
+    <style>
+      .pst-js-only { display: none !important; }
+
+    </style>
+  </noscript>
+  
+  <!-- Loaded before other Sphinx assets -->
+  <link href="../_static/styles/theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
+<link href="../_static/styles/pydata-sphinx-theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
+
+    <link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=8f2a1f02" />
+    <link rel="stylesheet" type="text/css" href="../_static/styles/nvidia-sphinx-theme.css?v=df3ac72c" />
+    <link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
+    <link rel="stylesheet" type="text/css" href="../_static/autodoc_pydantic.css" />
+  
+  <!-- So that users can add custom icons -->
+  <script src="../_static/scripts/fontawesome.js?digest=8878045cc6db502f8baf"></script>
+  <!-- Pre-loaded scripts that we'll load fully later -->
+  <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf" />
+<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf" />
+
+    <script src="../_static/documentation_options.js?v=5929fcd5"></script>
+    <script src="../_static/doctools.js?v=9a2dae69"></script>
+    <script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
+    <script src="../_static/clipboard.min.js?v=a7894cd8"></script>
+    <script src="../_static/copybutton.js?v=65e89d2a"></script>
+    <script>DOCUMENTATION_OPTIONS.pagename = 'examples/llm_eagle2_decoding';</script>
+    <script>
+        DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
+        DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
+        DOCUMENTATION_OPTIONS.show_version_warning_banner =
+            false;
+        </script>
+    <link rel="icon" href="../_static/favicon.png"/>
+    <link rel="index" title="Index" href="../genindex.html" />
+    <link rel="search" title="Search" href="../search.html" />
+    <link rel="next" title="Get KV Cache Events" href="llm_inference_kv_events.html" />
+    <link rel="prev" title="Control generated text using logits processor" href="llm_logits_processor.html" />
+
+  <meta name="viewport" content="width=device-width, initial-scale=1"/>
+  <meta name="docsearch:language" content="en"/>
+  <meta name="docsearch:version" content="0.21.0rc0" />
+
+
+  </head>
+  
+  
+  <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
+
+  
+  
+  <div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
+  
+  <div id="pst-scroll-pixel-helper"></div>
+  
+  <button type="button" class="btn rounded-pill" id="pst-back-to-top">
+    <i class="fa-solid fa-arrow-up"></i>Back to top</button>
+
+  
+  <dialog id="pst-search-dialog">
+    
+<form class="bd-search d-flex align-items-center"
+      action="../search.html"
+      method="get">
+  <i class="fa-solid fa-magnifying-glass"></i>
+  <input type="search"
+         class="form-control"
+         name="q"
+         placeholder="Search the docs ..."
+         aria-label="Search the docs ..."
+         autocomplete="off"
+         autocorrect="off"
+         autocapitalize="off"
+         spellcheck="false"/>
+  <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
+</form>
+  </dialog>
+
+  <div class="pst-async-banner-revealer d-none">
+  <aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
+</div>
+
+  
+    <header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
+<div class="bd-header__inner bd-page-width">
+  <button class="pst-navbar-icon sidebar-toggle primary-toggle" aria-label="Site navigation">
+    <span class="fa-solid fa-bars"></span>
+  </button>
+  
+  
+  <div class="col-lg-3 navbar-header-items__start">
+    
+      <div class="navbar-item">
+
+  
+    
+  
+
+<a class="navbar-brand logo" href="../index.html">
+  
+  
+  
+  
+  
+    
+    
+      
+    
+    
+    <img src="../_static/nvidia-logo-horiz-rgb-blk-for-screen.svg" class="logo__image only-light" alt="TensorRT-LLM - Home"/>
+    <img src="../_static/nvidia-logo-horiz-rgb-wht-for-screen.svg" class="logo__image only-dark pst-js-only" alt="TensorRT-LLM - Home"/>
+  
+  
+    <p class="title logo__title">TensorRT-LLM</p>
+  
+</a></div>
+    
+  </div>
+  
+  <div class="col-lg-9 navbar-header-items">
+    
+    <div class="me-auto navbar-header-items__center">
+      
+        <div class="navbar-item">
+
+
+<div class="version-switcher__container dropdown pst-js-only">
+  <button id="pst-version-switcher-button-2"
+    type="button"
+    class="version-switcher__button btn btn-sm dropdown-toggle"
+    data-bs-toggle="dropdown"
+    aria-haspopup="listbox"
+    aria-controls="pst-version-switcher-list-2"
+    aria-label="Version switcher list"
+  >
+    Choose version  <!-- this text may get changed later by javascript -->
+    <span class="caret"></span>
+  </button>
+  <div id="pst-version-switcher-list-2"
+    class="version-switcher__menu dropdown-menu list-group-flush py-0"
+    role="listbox" aria-labelledby="pst-version-switcher-button-2">
+    <!-- dropdown will be populated by javascript on page load -->
+  </div>
+</div></div>
+      
+    </div>
+    
+    
+    <div class="navbar-header-items__end">
+      
+        <div class="navbar-item navbar-persistent--container">
+          
+
+<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
+ <i class="fa-solid fa-magnifying-glass"></i>
+ <span class="search-button__default-text">Search</span>
+ <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
+</button>
+        </div>
+      
+      
+        <div class="navbar-item">
+
+<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode"  data-bs-placement="bottom" data-bs-toggle="tooltip">
+  <i class="theme-switch fa-solid fa-sun                fa-lg" data-mode="light" title="Light"></i>
+  <i class="theme-switch fa-solid fa-moon               fa-lg" data-mode="dark"  title="Dark"></i>
+  <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"  title="System Settings"></i>
+</button></div>
+      
+    </div>
+    
+  </div>
+  
+  
+    <div class="navbar-persistent--mobile">
+
+<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
+ <i class="fa-solid fa-magnifying-glass"></i>
+ <span class="search-button__default-text">Search</span>
+ <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
+</button>
+    </div>
+  
+
+  
+</div>
+
+    </header>
+  
+
+  <div class="bd-container">
+    <div class="bd-container__inner bd-page-width">
+      
+      
+      
+      <dialog id="pst-primary-sidebar-modal"></dialog>
+      <div id="pst-primary-sidebar" class="bd-sidebar-primary bd-sidebar">
+        
+
+
+
+  
+    
+  
+
+<a class="navbar-brand logo" href="../index.html">
+  
+  
+  
+  
+  
+    
+    
+      
+    
+    
+    <img src="../_static/nvidia-logo-horiz-rgb-blk-for-screen.svg" class="logo__image only-light" alt="TensorRT-LLM - Home"/>
+    <img src="../_static/nvidia-logo-horiz-rgb-wht-for-screen.svg" class="logo__image only-dark pst-js-only" alt="TensorRT-LLM - Home"/>
+  
+  
+    <p class="title logo__title">TensorRT-LLM</p>
+  
+</a>
+
+
+  
+  <div class="sidebar-header-items sidebar-primary__section">
+    
+    
+      <div class="sidebar-header-items__center">
+        
+          
+          
+            <div class="navbar-item">
+
+
+<div class="version-switcher__container dropdown pst-js-only">
+  <button id="pst-version-switcher-button-3"
+    type="button"
+    class="version-switcher__button btn btn-sm dropdown-toggle"
+    data-bs-toggle="dropdown"
+    aria-haspopup="listbox"
+    aria-controls="pst-version-switcher-list-3"
+    aria-label="Version switcher list"
+  >
+    Choose version  <!-- this text may get changed later by javascript -->
+    <span class="caret"></span>
+  </button>
+  <div id="pst-version-switcher-list-3"
+    class="version-switcher__menu dropdown-menu list-group-flush py-0"
+    role="listbox" aria-labelledby="pst-version-switcher-button-3">
+    <!-- dropdown will be populated by javascript on page load -->
+  </div>
+</div></div>
+          
+        
+      </div>
+    
+    
+    
+      <div class="sidebar-header-items__end">
+        
+          <div class="navbar-item">
+
+<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode"  data-bs-placement="bottom" data-bs-toggle="tooltip">
+  <i class="theme-switch fa-solid fa-sun                fa-lg" data-mode="light" title="Light"></i>
+  <i class="theme-switch fa-solid fa-moon               fa-lg" data-mode="dark"  title="Dark"></i>
+  <i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto"  title="System Settings"></i>
+</button></div>
+        
+      </div>
+    
+  </div>
+  
+    <div class="sidebar-primary-items__start sidebar-primary__section">
+        <div class="sidebar-primary-item">
+
+
+
+<nav class="bd-docs-nav bd-links"
+     aria-label="Table of Contents">
+  <p class="bd-links__title" role="heading" aria-level="1">Table of Contents</p>
+  <div class="bd-toc-item navbar-nav"><p aria-level="2" class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../overview.html">Overview</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../quick-start-guide.html">Quick Start Guide</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../key-features.html">Key Features</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../torch.html">PyTorch Backend</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../release-notes.html">Release Notes</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Installation</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../installation/linux.html">Installing on Linux</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-linux.html">Building from Source Code on Linux</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../installation/grace-hopper.html">Installing on Grace Hopper</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">LLM API</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../llm-api/index.html">API Introduction</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../llm-api/reference.html">API Reference</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
+<ul class="current nav bd-sidenav">
+<li class="toctree-l1 current active has-children"><a class="reference internal" href="index.html">LLM Examples Introduction</a><details open="open"><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul class="current">
+<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2 current active"><a class="current reference internal" href="#">Generate Text Using Eagle2 Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text with customization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_mgmn_trtllm_serve.html">Llm Mgmn Trtllm Serve</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1"><a class="reference internal" href="customization.html">LLM Common Customizations</a></li>
+<li class="toctree-l1 current active has-children"><a class="reference internal" href="llm_api_examples.html">LLM Examples</a><details open="open"><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul class="current">
+<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2 current active"><a class="current reference internal" href="#">Generate Text Using Eagle2 Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text with customization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_mgmn_trtllm_serve.html">Llm Mgmn Trtllm Serve</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="trtllm_serve_examples.html">Online Serving Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="curl_chat_client.html">Curl Chat Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="curl_chat_client_for_multimodal.html">Curl Chat Client For Multimodal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="curl_completion_client.html">Curl Completion Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
+<li class="toctree-l2"><a class="reference internal" href="genai_perf_client.html">Genai Perf Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="openai_chat_client.html">OpenAI Chat Client</a></li>
+<li class="toctree-l2"><a class="reference internal" href="openai_chat_client_for_multimodal.html">OpenAI Chat Client For Multimodal</a></li>
+<li class="toctree-l2"><a class="reference internal" href="openai_completion_client.html">OpenAI Completion Client</a></li>
+</ul>
+</details></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Model Definition API</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.layers.html">Layers</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.functional.html">Functionals</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.models.html">Models</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.plugin.html">Plugin</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html">Quantization</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.runtime.html">Runtime</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">C++ API</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/executor.html">Executor</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/runtime.html">Runtime</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-build.html">trtllm-build</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-serve.html">trtllm-serve</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../architecture/overview.html">TensorRT-LLM Architecture</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html">Model Definition</a></li>
+
+
+
+<li class="toctree-l1"><a class="reference internal" href="../architecture/checkpoint.html">TensorRT-LLM Checkpoint</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../architecture/workflow.html">TensorRT-LLM Build Workflow</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../architecture/add-model.html">Adding a Model</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Advanced</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-runtime.html">C++ GPT Runtime</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/executor.html">Executor API</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../performance/perf-overview.html">Overview</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../performance/perf-benchmarking.html">Benchmarking</a></li>
+<li class="toctree-l1 has-children"><a class="reference internal" href="../performance/performance-tuning-guide/index.html">Performance Tuning Guide</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
+<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/benchmarking-default-performance.html">Benchmarking Default Performance</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/useful-build-time-flags.html">Useful Build-Time Flags</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.html">Tuning Max Batch Size and Max Num Tokens</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/deciding-model-sharding-strategy.html">Deciding Model Sharding Strategy</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/fp8-quantization.html">FP8 Quantization</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/useful-runtime-flags.html">Useful Runtime Options</a></li>
+</ul>
+</details></li>
+<li class="toctree-l1"><a class="reference internal" href="../performance/perf-analysis.html">Performance Analysis</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Reference</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../reference/troubleshooting.html">Troubleshooting</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../reference/support-matrix.html">Support Matrix</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../reference/precision.html">Numerical Precision</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../reference/memory.html">Memory Usage of TensorRT-LLM</a></li>
+</ul>
+<p aria-level="2" class="caption" role="heading"><span class="caption-text">Blogs</span></p>
+<ul class="nav bd-sidenav">
+<li class="toctree-l1"><a class="reference internal" href="../blogs/H100vsA100.html">H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/H200launch.html">H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
+</ul>
+</div>
+</nav></div>
+    </div>
+  
+  
+  <div class="sidebar-primary-items__end sidebar-primary__section">
+  </div>
+
+
+
+      </div>
+      
+      <main id="main-content" class="bd-main" role="main">
+        
+        
+          <div class="bd-content">
+            <div class="bd-article-container">
+              
+              <div class="bd-header-article d-print-none">
+<div class="header-article-items header-article__inner">
+  
+    <div class="header-article-items__start">
+      
+        <div class="header-article-item">
+
+<nav aria-label="Breadcrumb" class="d-print-none">
+  <ul class="bd-breadcrumbs">
+    
+    <li class="breadcrumb-item breadcrumb-home">
+      <a href="../index.html" class="nav-link" aria-label="Home">
+        <i class="fa-solid fa-home"></i>
+      </a>
+    </li>
+    
+    <li class="breadcrumb-item"><a href="index.html" class="nav-link">LLM Examples Introduction</a></li>
+    
+    <li class="breadcrumb-item active" aria-current="page"><span class="ellipsis">Generate Text Using Eagle2 Decoding</span></li>
+  </ul>
+</nav>
+</div>
+      
+    </div>
+  
+  
+</div>
+</div>
+              
+              
+              
+                
+<div id="searchbox"></div>
+                <article class="bd-article">
+                  
+  <section id="generate-text-using-eagle2-decoding">
+<h1>Generate Text Using Eagle2 Decoding<a class="headerlink" href="#generate-text-using-eagle2-decoding" title="Link to this heading">#</a></h1>
+<p>Source <a class="github reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llm-api/llm_eagle2_decoding.py">NVIDIA/TensorRT-LLM</a>.</p>
+<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="linenos"> 1</span><span class="c1">### Generate Text Using Eagle2 Decoding</span>
+<span class="linenos"> 2</span>
+<span class="linenos"> 3</span><span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm</span><span class="w"> </span><span class="kn">import</span> <span class="n">LLM</span><span class="p">,</span> <span class="n">SamplingParams</span>
+<span class="linenos"> 4</span><span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.llmapi</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">LLM</span><span class="p">,</span> <span class="n">EagleDecodingConfig</span><span class="p">,</span> <span class="n">KvCacheConfig</span><span class="p">,</span>
+<span class="linenos"> 5</span>                                 <span class="n">SamplingParams</span><span class="p">)</span>
+<span class="linenos"> 6</span>
+<span class="linenos"> 7</span>
+<span class="linenos"> 8</span><span class="k">def</span><span class="w"> </span><span class="nf">main</span><span class="p">():</span>
+<span class="linenos"> 9</span>    <span class="c1"># Sample prompts.</span>
+<span class="linenos">10</span>    <span class="n">prompts</span> <span class="o">=</span> <span class="p">[</span>
+<span class="linenos">11</span>        <span class="s2">&quot;Hello, my name is&quot;</span><span class="p">,</span>
+<span class="linenos">12</span>        <span class="s2">&quot;The president of the United States is&quot;</span><span class="p">,</span>
+<span class="linenos">13</span>        <span class="s2">&quot;The capital of France is&quot;</span><span class="p">,</span>
+<span class="linenos">14</span>        <span class="s2">&quot;The future of AI is&quot;</span><span class="p">,</span>
+<span class="linenos">15</span>    <span class="p">]</span>
+<span class="linenos">16</span>    <span class="c1"># The end user can customize the sampling configuration with the SamplingParams class</span>
+<span class="linenos">17</span>    <span class="n">sampling_params</span> <span class="o">=</span> <span class="n">SamplingParams</span><span class="p">(</span><span class="n">temperature</span><span class="o">=</span><span class="mf">0.8</span><span class="p">,</span> <span class="n">top_p</span><span class="o">=</span><span class="mf">0.95</span><span class="p">)</span>
+<span class="linenos">18</span>
+<span class="linenos">19</span>    <span class="c1"># The end user can customize the kv cache configuration with the KvCacheConfig class</span>
+<span class="linenos">20</span>    <span class="n">kv_cache_config</span> <span class="o">=</span> <span class="n">KvCacheConfig</span><span class="p">(</span><span class="n">enable_block_reuse</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+<span class="linenos">21</span>
+<span class="linenos">22</span>    <span class="n">llm_kwargs</span> <span class="o">=</span> <span class="p">{}</span>
+<span class="linenos">23</span>
+<span class="linenos">24</span>    <span class="n">model</span> <span class="o">=</span> <span class="s2">&quot;lmsys/vicuna-7b-v1.3&quot;</span>
+<span class="linenos">25</span>
+<span class="linenos">26</span>    <span class="c1"># The end user can customize the eagle decoding configuration by specifying the</span>
+<span class="linenos">27</span>    <span class="c1"># speculative_model, max_draft_len, num_eagle_layers, max_non_leaves_per_layer, eagle_choices,</span>
+<span class="linenos">28</span>    <span class="c1"># greedy_sampling, posterior_threshold, use_dynamic_tree and dynamic_tree_max_topK</span>
+<span class="linenos">29</span>    <span class="c1"># with the EagleDecodingConfig class</span>
+<span class="linenos">30</span>
+<span class="linenos">31</span>    <span class="n">speculative_config</span> <span class="o">=</span> <span class="n">EagleDecodingConfig</span><span class="p">(</span>
+<span class="linenos">32</span>        <span class="n">speculative_model</span><span class="o">=</span><span class="s2">&quot;yuhuili/EAGLE-Vicuna-7B-v1.3&quot;</span><span class="p">,</span>
+<span class="linenos">33</span>        <span class="n">max_draft_len</span><span class="o">=</span><span class="mi">63</span><span class="p">,</span>
+<span class="linenos">34</span>        <span class="n">num_eagle_layers</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span>
+<span class="linenos">35</span>        <span class="n">max_non_leaves_per_layer</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span>
+<span class="linenos">36</span>        <span class="n">use_dynamic_tree</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
+<span class="linenos">37</span>        <span class="n">dynamic_tree_max_topK</span><span class="o">=</span><span class="mi">10</span><span class="p">)</span>
+<span class="linenos">38</span>
+<span class="linenos">39</span>    <span class="n">llm</span> <span class="o">=</span> <span class="n">LLM</span><span class="p">(</span><span class="n">model</span><span class="o">=</span><span class="n">model</span><span class="p">,</span>
+<span class="linenos">40</span>              <span class="n">kv_cache_config</span><span class="o">=</span><span class="n">kv_cache_config</span><span class="p">,</span>
+<span class="linenos">41</span>              <span class="n">speculative_config</span><span class="o">=</span><span class="n">speculative_config</span><span class="p">,</span>
+<span class="linenos">42</span>              <span class="n">max_batch_size</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
+<span class="linenos">43</span>              <span class="n">max_seq_len</span><span class="o">=</span><span class="mi">1024</span><span class="p">,</span>
+<span class="linenos">44</span>              <span class="o">**</span><span class="n">llm_kwargs</span><span class="p">)</span>
+<span class="linenos">45</span>
+<span class="linenos">46</span>    <span class="n">outputs</span> <span class="o">=</span> <span class="n">llm</span><span class="o">.</span><span class="n">generate</span><span class="p">(</span><span class="n">prompts</span><span class="p">,</span> <span class="n">sampling_params</span><span class="p">)</span>
+<span class="linenos">47</span>
+<span class="linenos">48</span>    <span class="c1"># Print the outputs.</span>
+<span class="linenos">49</span>    <span class="k">for</span> <span class="n">output</span> <span class="ow">in</span> <span class="n">outputs</span><span class="p">:</span>
+<span class="linenos">50</span>        <span class="n">prompt</span> <span class="o">=</span> <span class="n">output</span><span class="o">.</span><span class="n">prompt</span>
+<span class="linenos">51</span>        <span class="n">generated_text</span> <span class="o">=</span> <span class="n">output</span><span class="o">.</span><span class="n">outputs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">text</span>
+<span class="linenos">52</span>        <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Prompt: </span><span class="si">{</span><span class="n">prompt</span><span class="si">!r}</span><span class="s2">, Generated text: </span><span class="si">{</span><span class="n">generated_text</span><span class="si">!r}</span><span class="s2">&quot;</span><span class="p">)</span>
+<span class="linenos">53</span>
+<span class="linenos">54</span>
+<span class="linenos">55</span><span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s1">&#39;__main__&#39;</span><span class="p">:</span>
+<span class="linenos">56</span>    <span class="n">main</span><span class="p">()</span>
+</pre></div>
+</div>
+</section>
+
+
+                </article>
+              
+              
+              
+              
+              
+                <footer class="prev-next-footer d-print-none">
+                  
+<div class="prev-next-area">
+    <a class="left-prev"
+       href="llm_logits_processor.html"
+       title="previous page">
+      <i class="fa-solid fa-angle-left"></i>
+      <div class="prev-next-info">
+        <p class="prev-next-subtitle">previous</p>
+        <p class="prev-next-title">Control generated text using logits processor</p>
+      </div>
+    </a>
+    <a class="right-next"
+       href="llm_inference_kv_events.html"
+       title="next page">
+      <div class="prev-next-info">
+        <p class="prev-next-subtitle">next</p>
+        <p class="prev-next-title">Get KV Cache Events</p>
+      </div>
+      <i class="fa-solid fa-angle-right"></i>
+    </a>
+</div>
+                </footer>
+              
+            </div>
+            
+            
+
+<div class="bd-sidebar-secondary"></div>
+
+
+              
+            
+
+          </div>
+          <footer class="bd-footer-content">
+            
+          </footer>
+        
+      </main>
+    </div>
+  </div>
+  
+  <!-- Scripts loaded after <body> so the DOM is not blocked -->
+  <script defer src="../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf"></script>
+<script defer src="../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf"></script>
+
+  <footer class="bd-footer">
+<div class="bd-footer__inner bd-page-width">
+  
+    <div class="footer-items__start">
+      
+        <div class="footer-item">
+<a class="footer-brand logo" href="https://www.nvidia.com">
+  <img src="../_static/nvidia-logo-horiz-rgb-1c-blk-for-screen.svg" class="logo__image only-light" alt="NVIDIA"/>
+  <img src="../_static/nvidia-logo-horiz-rgb-1c-wht-for-screen.svg" class="logo__image only-dark" alt="NVIDIA"/>
+</a></div>
+      
+        <div class="footer-item">
+
+<div class="footer-links">
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/">Privacy Policy</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/">Manage My Privacy</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/preferences/start/">Do Not Sell or Share My Data</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/">Terms of Service</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/">Accessibility</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/">Corporate Policies</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/product-security/">Product Security</a>
+   | 
+  
+  
+  
+  <a class="external" href="https://www.nvidia.com/en-us/contact/">Contact</a>
+  
+  
+  
+</div>
+</div>
+      
+        <div class="footer-item">
+
+
+
+
+  <p class="copyright">
+    
+      Copyright © 2025, NVIDIA.
+      <br/>
+    
+  </p>
+</div>
+      
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
+    </div>
+  
+  
+  
+</div>
+
+  </footer>
+  </body>
+</html>
\ No newline at end of file
diff --git a/latest/examples/llm_eagle_decoding.html b/latest/examples/llm_eagle_decoding.html
index 05d1ac4e95..003714d8bf 100644
--- a/latest/examples/llm_eagle_decoding.html
+++ b/latest/examples/llm_eagle_decoding.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -511,8 +515,8 @@
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="linenos"> 1</span><span class="c1">### Generate Text Using Eagle Decoding</span>
 <span class="linenos"> 2</span>
 <span class="linenos"> 3</span><span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm</span><span class="w"> </span><span class="kn">import</span> <span class="n">LLM</span><span class="p">,</span> <span class="n">SamplingParams</span>
-<span class="linenos"> 4</span><span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.llmapi</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">LLM</span><span class="p">,</span> <span class="n">BuildConfig</span><span class="p">,</span> <span class="n">EagleDecodingConfig</span><span class="p">,</span>
-<span class="linenos"> 5</span>                                 <span class="n">KvCacheConfig</span><span class="p">,</span> <span class="n">SamplingParams</span><span class="p">)</span>
+<span class="linenos"> 4</span><span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.llmapi</span><span class="w"> </span><span class="kn">import</span> <span class="p">(</span><span class="n">LLM</span><span class="p">,</span> <span class="n">EagleDecodingConfig</span><span class="p">,</span> <span class="n">KvCacheConfig</span><span class="p">,</span>
+<span class="linenos"> 5</span>                                 <span class="n">SamplingParams</span><span class="p">)</span>
 <span class="linenos"> 6</span>
 <span class="linenos"> 7</span>
 <span class="linenos"> 8</span><span class="k">def</span><span class="w"> </span><span class="nf">main</span><span class="p">():</span>
@@ -526,51 +530,49 @@
 <span class="linenos">16</span>    <span class="c1"># The end user can customize the sampling configuration with the SamplingParams class</span>
 <span class="linenos">17</span>    <span class="n">sampling_params</span> <span class="o">=</span> <span class="n">SamplingParams</span><span class="p">(</span><span class="n">temperature</span><span class="o">=</span><span class="mf">0.8</span><span class="p">,</span> <span class="n">top_p</span><span class="o">=</span><span class="mf">0.95</span><span class="p">)</span>
 <span class="linenos">18</span>
-<span class="linenos">19</span>    <span class="c1"># The end user can customize the build configuration with the BuildConfig class</span>
-<span class="linenos">20</span>    <span class="n">build_config</span> <span class="o">=</span> <span class="n">BuildConfig</span><span class="p">(</span><span class="n">max_batch_size</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span> <span class="n">max_seq_len</span><span class="o">=</span><span class="mi">1024</span><span class="p">)</span>
+<span class="linenos">19</span>    <span class="c1"># The end user can customize the kv cache configuration with the KVCache class</span>
+<span class="linenos">20</span>    <span class="n">kv_cache_config</span> <span class="o">=</span> <span class="n">KvCacheConfig</span><span class="p">(</span><span class="n">enable_block_reuse</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
 <span class="linenos">21</span>
-<span class="linenos">22</span>    <span class="c1"># The end user can customize the kv cache configuration with the KVCache class</span>
-<span class="linenos">23</span>    <span class="n">kv_cache_config</span> <span class="o">=</span> <span class="n">KvCacheConfig</span><span class="p">(</span><span class="n">enable_block_reuse</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
-<span class="linenos">24</span>
-<span class="linenos">25</span>    <span class="n">llm_kwargs</span> <span class="o">=</span> <span class="p">{}</span>
-<span class="linenos">26</span>
-<span class="linenos">27</span>    <span class="n">model</span> <span class="o">=</span> <span class="s2">&quot;lmsys/vicuna-7b-v1.3&quot;</span>
-<span class="linenos">28</span>
-<span class="linenos">29</span>    <span class="c1"># The end user can customize the eagle decoding configuration by specifying the</span>
-<span class="linenos">30</span>    <span class="c1"># speculative_model, max_draft_len, num_eagle_layers, max_non_leaves_per_layer, eagle_choices</span>
-<span class="linenos">31</span>    <span class="c1"># greedy_sampling,posterior_threshold, use_dynamic_tree and dynamic_tree_max_topK</span>
-<span class="linenos">32</span>    <span class="c1"># with the EagleDecodingConfig class</span>
-<span class="linenos">33</span>
-<span class="linenos">34</span>    <span class="n">speculative_config</span> <span class="o">=</span> <span class="n">EagleDecodingConfig</span><span class="p">(</span>
-<span class="linenos">35</span>        <span class="n">speculative_model</span><span class="o">=</span><span class="s2">&quot;yuhuili/EAGLE-Vicuna-7B-v1.3&quot;</span><span class="p">,</span>
-<span class="linenos">36</span>        <span class="n">max_draft_len</span><span class="o">=</span><span class="mi">63</span><span class="p">,</span>
-<span class="linenos">37</span>        <span class="n">num_eagle_layers</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span>
-<span class="linenos">38</span>        <span class="n">max_non_leaves_per_layer</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span>
-<span class="linenos">39</span>                            <span class="n">eagle_choices</span><span class="o">=</span><span class="p">[[</span><span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">4</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">4</span><span class="p">],</span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> \
-<span class="linenos">40</span>                                            <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">5</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">5</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">6</span><span class="p">],</span> <span class="p">[</span><span class="mi">6</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">7</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">7</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">8</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> \
-<span class="linenos">41</span>                                            <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">9</span><span class="p">],</span> <span class="p">[</span><span class="mi">8</span><span class="p">],</span> <span class="p">[</span><span class="mi">9</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">4</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">5</span><span class="p">],</span> \
-<span class="linenos">42</span>                                            <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">6</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">5</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">7</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">8</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">9</span><span class="p">],</span> \
-<span class="linenos">43</span>                                            <span class="p">[</span><span class="mi">6</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">4</span><span class="p">],</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">8</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> \
-<span class="linenos">44</span>                                            <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">5</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">9</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">6</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">6</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">0</span><span class="p">]]</span>
-<span class="linenos">45</span>    <span class="p">)</span>
-<span class="linenos">46</span>
-<span class="linenos">47</span>    <span class="n">llm</span> <span class="o">=</span> <span class="n">LLM</span><span class="p">(</span><span class="n">model</span><span class="o">=</span><span class="n">model</span><span class="p">,</span>
-<span class="linenos">48</span>              <span class="n">build_config</span><span class="o">=</span><span class="n">build_config</span><span class="p">,</span>
-<span class="linenos">49</span>              <span class="n">kv_cache_config</span><span class="o">=</span><span class="n">kv_cache_config</span><span class="p">,</span>
-<span class="linenos">50</span>              <span class="n">speculative_config</span><span class="o">=</span><span class="n">speculative_config</span><span class="p">,</span>
-<span class="linenos">51</span>              <span class="o">**</span><span class="n">llm_kwargs</span><span class="p">)</span>
+<span class="linenos">22</span>    <span class="n">llm_kwargs</span> <span class="o">=</span> <span class="p">{}</span>
+<span class="linenos">23</span>
+<span class="linenos">24</span>    <span class="n">model</span> <span class="o">=</span> <span class="s2">&quot;lmsys/vicuna-7b-v1.3&quot;</span>
+<span class="linenos">25</span>
+<span class="linenos">26</span>    <span class="c1"># The end user can customize the eagle decoding configuration by specifying the</span>
+<span class="linenos">27</span>    <span class="c1"># speculative_model, max_draft_len, num_eagle_layers, max_non_leaves_per_layer, eagle_choices</span>
+<span class="linenos">28</span>    <span class="c1"># greedy_sampling,posterior_threshold, use_dynamic_tree and dynamic_tree_max_topK</span>
+<span class="linenos">29</span>    <span class="c1"># with the EagleDecodingConfig class</span>
+<span class="linenos">30</span>
+<span class="linenos">31</span>    <span class="n">speculative_config</span> <span class="o">=</span> <span class="n">EagleDecodingConfig</span><span class="p">(</span>
+<span class="linenos">32</span>        <span class="n">speculative_model</span><span class="o">=</span><span class="s2">&quot;yuhuili/EAGLE-Vicuna-7B-v1.3&quot;</span><span class="p">,</span>
+<span class="linenos">33</span>        <span class="n">max_draft_len</span><span class="o">=</span><span class="mi">63</span><span class="p">,</span>
+<span class="linenos">34</span>        <span class="n">num_eagle_layers</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span>
+<span class="linenos">35</span>        <span class="n">max_non_leaves_per_layer</span><span class="o">=</span><span class="mi">10</span><span class="p">,</span>
+<span class="linenos">36</span>                            <span class="n">eagle_choices</span><span class="o">=</span><span class="p">[[</span><span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">4</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">4</span><span class="p">],</span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> \
+<span class="linenos">37</span>                                            <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">5</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">5</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">6</span><span class="p">],</span> <span class="p">[</span><span class="mi">6</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">7</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">7</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">8</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> \
+<span class="linenos">38</span>                                            <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">9</span><span class="p">],</span> <span class="p">[</span><span class="mi">8</span><span class="p">],</span> <span class="p">[</span><span class="mi">9</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">4</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">4</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">5</span><span class="p">],</span> \
+<span class="linenos">39</span>                                            <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">6</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">5</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">3</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">7</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">8</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">9</span><span class="p">],</span> \
+<span class="linenos">40</span>                                            <span class="p">[</span><span class="mi">6</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">4</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">4</span><span class="p">],</span> <span class="p">[</span><span class="mi">7</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span> <span class="p">[</span><span class="mi">8</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> \
+<span class="linenos">41</span>                                            <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">5</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">5</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">9</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">6</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">],</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">6</span><span class="p">],</span> <span class="p">[</span><span class="mi">0</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">0</span><span class="p">]]</span>
+<span class="linenos">42</span>    <span class="p">)</span>
+<span class="linenos">43</span>
+<span class="linenos">44</span>    <span class="n">llm</span> <span class="o">=</span> <span class="n">LLM</span><span class="p">(</span><span class="n">model</span><span class="o">=</span><span class="n">model</span><span class="p">,</span>
+<span class="linenos">45</span>              <span class="n">kv_cache_config</span><span class="o">=</span><span class="n">kv_cache_config</span><span class="p">,</span>
+<span class="linenos">46</span>              <span class="n">speculative_config</span><span class="o">=</span><span class="n">speculative_config</span><span class="p">,</span>
+<span class="linenos">47</span>              <span class="n">max_batch_size</span><span class="o">=</span><span class="mi">1</span><span class="p">,</span>
+<span class="linenos">48</span>              <span class="n">max_seq_len</span><span class="o">=</span><span class="mi">1024</span><span class="p">,</span>
+<span class="linenos">49</span>              <span class="o">**</span><span class="n">llm_kwargs</span><span class="p">)</span>
+<span class="linenos">50</span>
+<span class="linenos">51</span>    <span class="n">outputs</span> <span class="o">=</span> <span class="n">llm</span><span class="o">.</span><span class="n">generate</span><span class="p">(</span><span class="n">prompts</span><span class="p">,</span> <span class="n">sampling_params</span><span class="p">)</span>
 <span class="linenos">52</span>
-<span class="linenos">53</span>    <span class="n">outputs</span> <span class="o">=</span> <span class="n">llm</span><span class="o">.</span><span class="n">generate</span><span class="p">(</span><span class="n">prompts</span><span class="p">,</span> <span class="n">sampling_params</span><span class="p">)</span>
-<span class="linenos">54</span>
-<span class="linenos">55</span>    <span class="c1"># Print the outputs.</span>
-<span class="linenos">56</span>    <span class="k">for</span> <span class="n">output</span> <span class="ow">in</span> <span class="n">outputs</span><span class="p">:</span>
-<span class="linenos">57</span>        <span class="n">prompt</span> <span class="o">=</span> <span class="n">output</span><span class="o">.</span><span class="n">prompt</span>
-<span class="linenos">58</span>        <span class="n">generated_text</span> <span class="o">=</span> <span class="n">output</span><span class="o">.</span><span class="n">outputs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">text</span>
-<span class="linenos">59</span>        <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Prompt: </span><span class="si">{</span><span class="n">prompt</span><span class="si">!r}</span><span class="s2">, Generated text: </span><span class="si">{</span><span class="n">generated_text</span><span class="si">!r}</span><span class="s2">&quot;</span><span class="p">)</span>
-<span class="linenos">60</span>
-<span class="linenos">61</span>
-<span class="linenos">62</span><span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s1">&#39;__main__&#39;</span><span class="p">:</span>
-<span class="linenos">63</span>    <span class="n">main</span><span class="p">()</span>
+<span class="linenos">53</span>    <span class="c1"># Print the outputs.</span>
+<span class="linenos">54</span>    <span class="k">for</span> <span class="n">output</span> <span class="ow">in</span> <span class="n">outputs</span><span class="p">:</span>
+<span class="linenos">55</span>        <span class="n">prompt</span> <span class="o">=</span> <span class="n">output</span><span class="o">.</span><span class="n">prompt</span>
+<span class="linenos">56</span>        <span class="n">generated_text</span> <span class="o">=</span> <span class="n">output</span><span class="o">.</span><span class="n">outputs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">text</span>
+<span class="linenos">57</span>        <span class="nb">print</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Prompt: </span><span class="si">{</span><span class="n">prompt</span><span class="si">!r}</span><span class="s2">, Generated text: </span><span class="si">{</span><span class="n">generated_text</span><span class="si">!r}</span><span class="s2">&quot;</span><span class="p">)</span>
+<span class="linenos">58</span>
+<span class="linenos">59</span>
+<span class="linenos">60</span><span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s1">&#39;__main__&#39;</span><span class="p">:</span>
+<span class="linenos">61</span>    <span class="n">main</span><span class="p">()</span>
 </pre></div>
 </div>
 </section>
@@ -700,6 +702,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/llm_guided_decoding.html b/latest/examples/llm_guided_decoding.html
index a26dba34ca..16eae7f081 100644
--- a/latest/examples/llm_guided_decoding.html
+++ b/latest/examples/llm_guided_decoding.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -682,6 +686,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/llm_inference.html b/latest/examples/llm_inference.html
index d9f141cdd6..b4cd6c2808 100644
--- a/latest/examples/llm_inference.html
+++ b/latest/examples/llm_inference.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -677,6 +681,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/llm_inference_async.html b/latest/examples/llm_inference_async.html
index 33ce47b8d0..e03351a886 100644
--- a/latest/examples/llm_inference_async.html
+++ b/latest/examples/llm_inference_async.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2 current active"><a class="current reference internal" href="#">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2 current active"><a class="current reference internal" href="#">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -680,6 +684,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/llm_inference_async_streaming.html b/latest/examples/llm_inference_async_streaming.html
index 32559c9259..26a6874d88 100644
--- a/latest/examples/llm_inference_async_streaming.html
+++ b/latest/examples/llm_inference_async_streaming.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -700,6 +704,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/llm_inference_customize.html b/latest/examples/llm_inference_customize.html
index 0dfda09e0e..ceb4b1aa83 100644
--- a/latest/examples/llm_inference_customize.html
+++ b/latest/examples/llm_inference_customize.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -693,6 +697,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/llm_inference_distributed.html b/latest/examples/llm_inference_distributed.html
index 0c5554b237..2a9eaf5915 100644
--- a/latest/examples/llm_inference_distributed.html
+++ b/latest/examples/llm_inference_distributed.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2 current active"><a class="current reference internal" href="#">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2 current active"><a class="current reference internal" href="#">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -681,6 +685,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/llm_inference_kv_events.html b/latest/examples/llm_inference_kv_events.html
index 5dd35528a3..d0de68e3b7 100644
--- a/latest/examples/llm_inference_kv_events.html
+++ b/latest/examples/llm_inference_kv_events.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -59,11 +59,11 @@
     <link rel="index" title="Index" href="../genindex.html" />
     <link rel="search" title="Search" href="../search.html" />
     <link rel="next" title="Generate Text Using Lookahead Decoding" href="llm_lookahead_decoding.html" />
-    <link rel="prev" title="Control generated text using logits processor" href="llm_logits_processor.html" />
+    <link rel="prev" title="Generate Text Using Eagle2 Decoding" href="llm_eagle2_decoding.html" />
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2 current active"><a class="current reference internal" href="#">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2 current active"><a class="current reference internal" href="#">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -511,53 +515,51 @@
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="linenos"> 1</span><span class="c1">### Get KV Cache Events</span>
 <span class="linenos"> 2</span>
 <span class="linenos"> 3</span><span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm</span><span class="w"> </span><span class="kn">import</span> <span class="n">LLM</span><span class="p">,</span> <span class="n">SamplingParams</span>
-<span class="linenos"> 4</span><span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm._torch.pyexecutor.config</span><span class="w"> </span><span class="kn">import</span> <span class="n">PyTorchConfig</span>
-<span class="linenos"> 5</span><span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.llmapi</span><span class="w"> </span><span class="kn">import</span> <span class="n">KvCacheConfig</span>
+<span class="linenos"> 4</span><span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm.llmapi</span><span class="w"> </span><span class="kn">import</span> <span class="n">KvCacheConfig</span>
+<span class="linenos"> 5</span>
 <span class="linenos"> 6</span>
-<span class="linenos"> 7</span>
-<span class="linenos"> 8</span><span class="k">def</span><span class="w"> </span><span class="nf">main</span><span class="p">():</span>
-<span class="linenos"> 9</span>    <span class="n">pytorch_config</span> <span class="o">=</span> <span class="n">PyTorchConfig</span><span class="p">(</span><span class="n">autotuner_enabled</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
-<span class="linenos">10</span>                                   <span class="n">kv_cache_dtype</span><span class="o">=</span><span class="s1">&#39;auto&#39;</span><span class="p">)</span>
-<span class="linenos">11</span>
-<span class="linenos">12</span>    <span class="n">llm</span> <span class="o">=</span> <span class="n">LLM</span><span class="p">(</span><span class="n">model</span><span class="o">=</span><span class="s2">&quot;TinyLlama/TinyLlama-1.1B-Chat-v1.0&quot;</span><span class="p">,</span>
-<span class="linenos">13</span>              <span class="n">tensor_parallel_size</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span>
-<span class="linenos">14</span>              <span class="n">pytorch_backend_config</span><span class="o">=</span><span class="n">pytorch_config</span><span class="p">,</span>
-<span class="linenos">15</span>              <span class="n">kv_cache_config</span><span class="o">=</span><span class="n">KvCacheConfig</span><span class="p">(</span><span class="n">enable_block_reuse</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
-<span class="linenos">16</span>                                            <span class="n">event_buffer_max_size</span><span class="o">=</span><span class="mi">1024</span><span class="p">),</span>
-<span class="linenos">17</span>              <span class="n">backend</span><span class="o">=</span><span class="s2">&quot;pytorch&quot;</span><span class="p">)</span>
-<span class="linenos">18</span>
-<span class="linenos">19</span>    <span class="c1"># Sample prompts having a common prefix.</span>
-<span class="linenos">20</span>    <span class="n">common_prefix</span> <span class="o">=</span> <span class="p">(</span>
-<span class="linenos">21</span>        <span class="s2">&quot;After the ghost&#39;s departure, Barnardo notes Horatio&#39;s pale appearance and asks if he&#39;s okay. &quot;</span>
-<span class="linenos">22</span>        <span class="s2">&quot;Horatio concedes that he&#39;s shaken and confesses that, without witnessing the ghost himself, he wouldn&#39;t have believed it existed. &quot;</span>
-<span class="linenos">23</span>        <span class="s2">&quot;He&#39;s also disturbed by the ghost&#39;s striking resemblance to the king. It even seems to be wearing the former king&#39;s armor. &quot;</span>
-<span class="linenos">24</span>        <span class="s2">&quot;Horatio thinks the ghost&#39;s presence foretells that something is about to go wrong in Denmark. &quot;</span>
-<span class="linenos">25</span>        <span class="s2">&quot;Marcellus concurs with Horatio, as he and the other guards have observed that their schedules have become more rigorous and have also noticed the preparations taking place within Elsinore, including the building of cannons, the storing of weapons, and the preparation of ships.&quot;</span>
-<span class="linenos">26</span>    <span class="p">)</span>
-<span class="linenos">27</span>    <span class="n">prompts</span> <span class="o">=</span> <span class="p">[</span>
-<span class="linenos">28</span>        <span class="n">common_prefix</span><span class="p">,</span> <span class="n">common_prefix</span> <span class="o">+</span> <span class="s2">&quot; Marcellus also notes that the king&#39;s&quot;</span>
-<span class="linenos">29</span>    <span class="p">]</span>
-<span class="linenos">30</span>
-<span class="linenos">31</span>    <span class="c1"># Create a sampling params.</span>
-<span class="linenos">32</span>    <span class="n">sampling_params</span> <span class="o">=</span> <span class="n">SamplingParams</span><span class="p">(</span><span class="n">temperature</span><span class="o">=</span><span class="mf">0.001</span><span class="p">,</span>
-<span class="linenos">33</span>                                     <span class="n">top_p</span><span class="o">=</span><span class="mf">0.001</span><span class="p">,</span>
-<span class="linenos">34</span>                                     <span class="n">max_tokens</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
-<span class="linenos">35</span>
-<span class="linenos">36</span>    <span class="k">for</span> <span class="n">output</span> <span class="ow">in</span> <span class="n">llm</span><span class="o">.</span><span class="n">generate</span><span class="p">(</span><span class="n">prompts</span><span class="p">,</span> <span class="n">sampling_params</span><span class="o">=</span><span class="n">sampling_params</span><span class="p">):</span>
-<span class="linenos">37</span>        <span class="nb">print</span><span class="p">(</span>
-<span class="linenos">38</span>            <span class="sa">f</span><span class="s2">&quot;Prompt: </span><span class="si">{</span><span class="n">output</span><span class="o">.</span><span class="n">prompt</span><span class="si">!r}</span><span class="s2">, Generated text: </span><span class="si">{</span><span class="n">output</span><span class="o">.</span><span class="n">outputs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">text</span><span class="si">!r}</span><span class="s2">&quot;</span>
-<span class="linenos">39</span>        <span class="p">)</span>
-<span class="linenos">40</span>
-<span class="linenos">41</span>    <span class="n">kv_events</span> <span class="o">=</span> <span class="n">llm</span><span class="o">.</span><span class="n">get_kv_cache_events</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span>
-<span class="linenos">42</span>    <span class="nb">print</span><span class="p">(</span><span class="n">kv_events</span><span class="p">)</span>
-<span class="linenos">43</span>
-<span class="linenos">44</span>    <span class="c1"># Got output like follows:</span>
-<span class="linenos">45</span>    <span class="c1"># [{&#39;event_id&#39;: 0, &#39;data&#39;: {&#39;type&#39;: &#39;created&#39;, &#39;num_blocks_per_cache_level&#39;: [101230, 0]}},</span>
-<span class="linenos">46</span>    <span class="c1">#  {&#39;event_id&#39;: 1, &#39;data&#39;: {&#39;type&#39;: &#39;stored&#39;, &#39;parent_hash&#39;: None, &#39;blocks&#39;: [{&#39;type&#39;: &#39;stored_block&#39;, &#39;block_hash&#39;: 4203099703668305365, &#39;tokens&#39;: [{&#39;type&#39;: &#39;unique_token&#39;, &#39;token_id&#39;: 1, &#39;token_extra_id&#39;: 0}, ...</span>
-<span class="linenos">47</span>
-<span class="linenos">48</span>
-<span class="linenos">49</span><span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s1">&#39;__main__&#39;</span><span class="p">:</span>
-<span class="linenos">50</span>    <span class="n">main</span><span class="p">()</span>
+<span class="linenos"> 7</span><span class="k">def</span><span class="w"> </span><span class="nf">main</span><span class="p">():</span>
+<span class="linenos"> 8</span>
+<span class="linenos"> 9</span>    <span class="n">llm</span> <span class="o">=</span> <span class="n">LLM</span><span class="p">(</span><span class="n">model</span><span class="o">=</span><span class="s2">&quot;TinyLlama/TinyLlama-1.1B-Chat-v1.0&quot;</span><span class="p">,</span>
+<span class="linenos">10</span>              <span class="n">tensor_parallel_size</span><span class="o">=</span><span class="mi">2</span><span class="p">,</span>
+<span class="linenos">11</span>              <span class="n">autotuner_enabled</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
+<span class="linenos">12</span>              <span class="n">kv_cache_dtype</span><span class="o">=</span><span class="s1">&#39;auto&#39;</span><span class="p">,</span>
+<span class="linenos">13</span>              <span class="n">kv_cache_config</span><span class="o">=</span><span class="n">KvCacheConfig</span><span class="p">(</span><span class="n">enable_block_reuse</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
+<span class="linenos">14</span>                                            <span class="n">event_buffer_max_size</span><span class="o">=</span><span class="mi">1024</span><span class="p">),</span>
+<span class="linenos">15</span>              <span class="n">backend</span><span class="o">=</span><span class="s2">&quot;pytorch&quot;</span><span class="p">)</span>
+<span class="linenos">16</span>
+<span class="linenos">17</span>    <span class="c1"># Sample prompts having a common prefix.</span>
+<span class="linenos">18</span>    <span class="n">common_prefix</span> <span class="o">=</span> <span class="p">(</span>
+<span class="linenos">19</span>        <span class="s2">&quot;After the ghost&#39;s departure, Barnardo notes Horatio&#39;s pale appearance and asks if he&#39;s okay. &quot;</span>
+<span class="linenos">20</span>        <span class="s2">&quot;Horatio concedes that he&#39;s shaken and confesses that, without witnessing the ghost himself, he wouldn&#39;t have believed it existed. &quot;</span>
+<span class="linenos">21</span>        <span class="s2">&quot;He&#39;s also disturbed by the ghost&#39;s striking resemblance to the king. It even seems to be wearing the former king&#39;s armor. &quot;</span>
+<span class="linenos">22</span>        <span class="s2">&quot;Horatio thinks the ghost&#39;s presence foretells that something is about to go wrong in Denmark. &quot;</span>
+<span class="linenos">23</span>        <span class="s2">&quot;Marcellus concurs with Horatio, as he and the other guards have observed that their schedules have become more rigorous and have also noticed the preparations taking place within Elsinore, including the building of cannons, the storing of weapons, and the preparation of ships.&quot;</span>
+<span class="linenos">24</span>    <span class="p">)</span>
+<span class="linenos">25</span>    <span class="n">prompts</span> <span class="o">=</span> <span class="p">[</span>
+<span class="linenos">26</span>        <span class="n">common_prefix</span><span class="p">,</span> <span class="n">common_prefix</span> <span class="o">+</span> <span class="s2">&quot; Marcellus also notes that the king&#39;s&quot;</span>
+<span class="linenos">27</span>    <span class="p">]</span>
+<span class="linenos">28</span>
+<span class="linenos">29</span>    <span class="c1"># Create the sampling params.</span>
+<span class="linenos">30</span>    <span class="n">sampling_params</span> <span class="o">=</span> <span class="n">SamplingParams</span><span class="p">(</span><span class="n">temperature</span><span class="o">=</span><span class="mf">0.001</span><span class="p">,</span>
+<span class="linenos">31</span>                                     <span class="n">top_p</span><span class="o">=</span><span class="mf">0.001</span><span class="p">,</span>
+<span class="linenos">32</span>                                     <span class="n">max_tokens</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
+<span class="linenos">33</span>
+<span class="linenos">34</span>    <span class="k">for</span> <span class="n">output</span> <span class="ow">in</span> <span class="n">llm</span><span class="o">.</span><span class="n">generate</span><span class="p">(</span><span class="n">prompts</span><span class="p">,</span> <span class="n">sampling_params</span><span class="o">=</span><span class="n">sampling_params</span><span class="p">):</span>
+<span class="linenos">35</span>        <span class="nb">print</span><span class="p">(</span>
+<span class="linenos">36</span>            <span class="sa">f</span><span class="s2">&quot;Prompt: </span><span class="si">{</span><span class="n">output</span><span class="o">.</span><span class="n">prompt</span><span class="si">!r}</span><span class="s2">, Generated text: </span><span class="si">{</span><span class="n">output</span><span class="o">.</span><span class="n">outputs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">text</span><span class="si">!r}</span><span class="s2">&quot;</span>
+<span class="linenos">37</span>        <span class="p">)</span>
+<span class="linenos">38</span>
+<span class="linenos">39</span>    <span class="n">kv_events</span> <span class="o">=</span> <span class="n">llm</span><span class="o">.</span><span class="n">get_kv_cache_events</span><span class="p">(</span><span class="mi">10</span><span class="p">)</span>
+<span class="linenos">40</span>    <span class="nb">print</span><span class="p">(</span><span class="n">kv_events</span><span class="p">)</span>
+<span class="linenos">41</span>
+<span class="linenos">42</span>    <span class="c1"># The output looks like the following:</span>
+<span class="linenos">43</span>    <span class="c1"># [{&#39;event_id&#39;: 0, &#39;data&#39;: {&#39;type&#39;: &#39;created&#39;, &#39;num_blocks_per_cache_level&#39;: [101230, 0]}},</span>
+<span class="linenos">44</span>    <span class="c1">#  {&#39;event_id&#39;: 1, &#39;data&#39;: {&#39;type&#39;: &#39;stored&#39;, &#39;parent_hash&#39;: None, &#39;blocks&#39;: [{&#39;type&#39;: &#39;stored_block&#39;, &#39;block_hash&#39;: 4203099703668305365, &#39;tokens&#39;: [{&#39;type&#39;: &#39;unique_token&#39;, &#39;token_id&#39;: 1, &#39;token_extra_id&#39;: 0}, ...</span>
+<span class="linenos">45</span>
+<span class="linenos">46</span>
+<span class="linenos">47</span><span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s1">&#39;__main__&#39;</span><span class="p">:</span>
+<span class="linenos">48</span>    <span class="n">main</span><span class="p">()</span>
 </pre></div>
 </div>
 </section>
@@ -573,12 +575,12 @@
                   
 <div class="prev-next-area">
     <a class="left-prev"
-       href="llm_logits_processor.html"
+       href="llm_eagle2_decoding.html"
        title="previous page">
       <i class="fa-solid fa-angle-left"></i>
       <div class="prev-next-info">
         <p class="prev-next-subtitle">previous</p>
-        <p class="prev-next-title">Control generated text using logits processor</p>
+        <p class="prev-next-title">Generate Text Using Eagle2 Decoding</p>
       </div>
     </a>
     <a class="right-next"
@@ -687,6 +689,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/llm_logits_processor.html b/latest/examples/llm_logits_processor.html
index 4a79379a87..a7a657defa 100644
--- a/latest/examples/llm_logits_processor.html
+++ b/latest/examples/llm_logits_processor.html
@@ -51,19 +51,19 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
     <link rel="icon" href="../_static/favicon.png"/>
     <link rel="index" title="Index" href="../genindex.html" />
     <link rel="search" title="Search" href="../search.html" />
-    <link rel="next" title="Get KV Cache Events" href="llm_inference_kv_events.html" />
+    <link rel="next" title="Generate Text Using Eagle2 Decoding" href="llm_eagle2_decoding.html" />
     <link rel="prev" title="Distributed LLM Generation" href="llm_inference_distributed.html" />
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2 current active"><a class="current reference internal" href="#">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2 current active"><a class="current reference internal" href="#">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -525,106 +529,107 @@
 <span class="linenos"> 15</span><span class="c1"># This simple callback will output a specific token at each step irrespective of prompt.</span>
 <span class="linenos"> 16</span><span class="c1"># Refer to ../bindings/executor/example_logits_processor.py for a more</span>
 <span class="linenos"> 17</span><span class="c1"># sophisticated callback that generates JSON structured output.</span>
-<span class="linenos"> 18</span><span class="k">class</span><span class="w"> </span><span class="nc">MyLogitsProcessor</span><span class="p">(</span><span class="n">LogitsProcessor</span><span class="p">):</span>
-<span class="linenos"> 19</span>
-<span class="linenos"> 20</span>    <span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">allowed_token_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
-<span class="linenos"> 21</span>        <span class="bp">self</span><span class="o">.</span><span class="n">allowed_token_id</span> <span class="o">=</span> <span class="n">allowed_token_id</span>
-<span class="linenos"> 22</span>
-<span class="linenos"> 23</span>    <span class="k">def</span><span class="w"> </span><span class="fm">__call__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">req_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">logits</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">,</span>
-<span class="linenos"> 24</span>                 <span class="n">token_ids</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]],</span> <span class="n">stream_ptr</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
-<span class="linenos"> 25</span>                 <span class="n">client_id</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]):</span>
-<span class="linenos"> 26</span>        <span class="n">mask</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">full_like</span><span class="p">(</span><span class="n">logits</span><span class="p">,</span> <span class="n">fill_value</span><span class="o">=</span><span class="nb">float</span><span class="p">(</span><span class="s2">&quot;-inf&quot;</span><span class="p">),</span> <span class="n">device</span><span class="o">=</span><span class="s2">&quot;cpu&quot;</span><span class="p">)</span>
-<span class="linenos"> 27</span>        <span class="n">mask</span><span class="p">[:,</span> <span class="p">:,</span> <span class="bp">self</span><span class="o">.</span><span class="n">allowed_token_id</span><span class="p">]</span> <span class="o">=</span> <span class="mi">0</span>
-<span class="linenos"> 28</span>
-<span class="linenos"> 29</span>        <span class="n">stream</span> <span class="o">=</span> <span class="kc">None</span> <span class="k">if</span> <span class="n">stream_ptr</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">ExternalStream</span><span class="p">(</span>
-<span class="linenos"> 30</span>            <span class="n">stream_ptr</span><span class="p">)</span>
-<span class="linenos"> 31</span>        <span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">stream</span><span class="p">(</span><span class="n">stream</span><span class="p">):</span>
-<span class="linenos"> 32</span>            <span class="n">mask</span> <span class="o">=</span> <span class="n">mask</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">logits</span><span class="o">.</span><span class="n">device</span><span class="p">,</span> <span class="n">non_blocking</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
-<span class="linenos"> 33</span>            <span class="n">logits</span> <span class="o">+=</span> <span class="n">mask</span>
-<span class="linenos"> 34</span>
+<span class="linenos"> 18</span><span class="c1"># Also refer to sampling_params.py to add your subclass to the approved class list for deserialization.</span>
+<span class="linenos"> 19</span><span class="k">class</span><span class="w"> </span><span class="nc">MyLogitsProcessor</span><span class="p">(</span><span class="n">LogitsProcessor</span><span class="p">):</span>
+<span class="linenos"> 20</span>
+<span class="linenos"> 21</span>    <span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">allowed_token_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
+<span class="linenos"> 22</span>        <span class="bp">self</span><span class="o">.</span><span class="n">allowed_token_id</span> <span class="o">=</span> <span class="n">allowed_token_id</span>
+<span class="linenos"> 23</span>
+<span class="linenos"> 24</span>    <span class="k">def</span><span class="w"> </span><span class="fm">__call__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">req_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">logits</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">,</span>
+<span class="linenos"> 25</span>                 <span class="n">token_ids</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]],</span> <span class="n">stream_ptr</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
+<span class="linenos"> 26</span>                 <span class="n">client_id</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]):</span>
+<span class="linenos"> 27</span>        <span class="n">mask</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">full_like</span><span class="p">(</span><span class="n">logits</span><span class="p">,</span> <span class="n">fill_value</span><span class="o">=</span><span class="nb">float</span><span class="p">(</span><span class="s2">&quot;-inf&quot;</span><span class="p">),</span> <span class="n">device</span><span class="o">=</span><span class="s2">&quot;cpu&quot;</span><span class="p">)</span>
+<span class="linenos"> 28</span>        <span class="n">mask</span><span class="p">[:,</span> <span class="p">:,</span> <span class="bp">self</span><span class="o">.</span><span class="n">allowed_token_id</span><span class="p">]</span> <span class="o">=</span> <span class="mi">0</span>
+<span class="linenos"> 29</span>
+<span class="linenos"> 30</span>        <span class="n">stream</span> <span class="o">=</span> <span class="kc">None</span> <span class="k">if</span> <span class="n">stream_ptr</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">ExternalStream</span><span class="p">(</span>
+<span class="linenos"> 31</span>            <span class="n">stream_ptr</span><span class="p">)</span>
+<span class="linenos"> 32</span>        <span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">stream</span><span class="p">(</span><span class="n">stream</span><span class="p">):</span>
+<span class="linenos"> 33</span>            <span class="n">mask</span> <span class="o">=</span> <span class="n">mask</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">logits</span><span class="o">.</span><span class="n">device</span><span class="p">,</span> <span class="n">non_blocking</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+<span class="linenos"> 34</span>            <span class="n">logits</span> <span class="o">+=</span> <span class="n">mask</span>
 <span class="linenos"> 35</span>
-<span class="linenos"> 36</span><span class="c1"># The recommended way to create a customized batched logits processor:</span>
-<span class="linenos"> 37</span><span class="c1">#     * Subclass BatchedLogitsProcessor and implement the processing logics in the __call__ method.</span>
-<span class="linenos"> 38</span><span class="c1">#     * Create an instance and pass to LLM.</span>
-<span class="linenos"> 39</span><span class="c1"># Alternatively, you can create any callable with the same signature with the __call__ method.</span>
-<span class="linenos"> 40</span><span class="c1"># A batched logits processor&#39;s arguments for all requests in a batch are made available as lists.</span>
-<span class="linenos"> 41</span><span class="c1"># This helps user optimize the callback for large batch sizes. For example:</span>
-<span class="linenos"> 42</span><span class="c1"># 1. Process more work on host, e.g. running a JSON state machine, in parallel with model forward pass on device.</span>
-<span class="linenos"> 43</span><span class="c1"># 2. Coalesce H2D memory transfers for all requests into a single cudaMemcpyAsync call.</span>
-<span class="linenos"> 44</span><span class="c1"># 3. Launch a single batched kernel, e.g. for updating logits on device.</span>
-<span class="linenos"> 45</span><span class="k">class</span><span class="w"> </span><span class="nc">MyBatchedLogitsProcessor</span><span class="p">(</span><span class="n">BatchedLogitsProcessor</span><span class="p">):</span>
-<span class="linenos"> 46</span>
-<span class="linenos"> 47</span>    <span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">allowed_token_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
-<span class="linenos"> 48</span>        <span class="bp">self</span><span class="o">.</span><span class="n">allowed_token_id</span> <span class="o">=</span> <span class="n">allowed_token_id</span>
-<span class="linenos"> 49</span>
-<span class="linenos"> 50</span>    <span class="k">def</span><span class="w"> </span><span class="fm">__call__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">req_ids</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span> <span class="n">logits</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">],</span>
-<span class="linenos"> 51</span>                 <span class="n">token_ids</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]],</span> <span class="n">stream_ptr</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
-<span class="linenos"> 52</span>                 <span class="n">client_ids</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]]):</span>
-<span class="linenos"> 53</span>        <span class="c1"># Generate masks for all requests on host</span>
-<span class="linenos"> 54</span>        <span class="n">masks</span> <span class="o">=</span> <span class="p">[]</span>
-<span class="linenos"> 55</span>        <span class="k">for</span> <span class="n">req_id</span><span class="p">,</span> <span class="n">req_logits</span><span class="p">,</span> <span class="n">req_token_ids</span><span class="p">,</span> <span class="n">client_id</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span>
-<span class="linenos"> 56</span>                <span class="n">req_ids</span><span class="p">,</span> <span class="n">logits</span><span class="p">,</span> <span class="n">token_ids</span><span class="p">,</span> <span class="n">client_ids</span><span class="p">):</span>
-<span class="linenos"> 57</span>            <span class="n">mask</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">full_like</span><span class="p">(</span><span class="n">req_logits</span><span class="p">,</span>
-<span class="linenos"> 58</span>                                   <span class="n">fill_value</span><span class="o">=</span><span class="nb">float</span><span class="p">(</span><span class="s2">&quot;-inf&quot;</span><span class="p">),</span>
-<span class="linenos"> 59</span>                                   <span class="n">device</span><span class="o">=</span><span class="s2">&quot;cpu&quot;</span><span class="p">)</span>
-<span class="linenos"> 60</span>            <span class="n">mask</span><span class="p">[:,</span> <span class="p">:,</span> <span class="bp">self</span><span class="o">.</span><span class="n">allowed_token_id</span><span class="p">]</span> <span class="o">=</span> <span class="mi">0</span>
-<span class="linenos"> 61</span>            <span class="n">masks</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">mask</span><span class="p">)</span>
-<span class="linenos"> 62</span>
-<span class="linenos"> 63</span>        <span class="c1"># Move masks to device and add to logits using non-blocking operations</span>
-<span class="linenos"> 64</span>        <span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">stream</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">ExternalStream</span><span class="p">(</span><span class="n">stream_ptr</span><span class="p">)):</span>
-<span class="linenos"> 65</span>            <span class="k">for</span> <span class="n">req_logits</span><span class="p">,</span> <span class="n">mask</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">logits</span><span class="p">,</span> <span class="n">masks</span><span class="p">):</span>
-<span class="linenos"> 66</span>                <span class="n">req_logits</span> <span class="o">+=</span> <span class="n">mask</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">req_logits</span><span class="o">.</span><span class="n">device</span><span class="p">,</span> <span class="n">non_blocking</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
-<span class="linenos"> 67</span>
+<span class="linenos"> 36</span>
+<span class="linenos"> 37</span><span class="c1"># The recommended way to create a customized batched logits processor:</span>
+<span class="linenos"> 38</span><span class="c1">#     * Subclass BatchedLogitsProcessor and implement the processing logics in the __call__ method.</span>
+<span class="linenos"> 39</span><span class="c1">#     * Create an instance and pass to LLM.</span>
+<span class="linenos"> 40</span><span class="c1"># Alternatively, you can create any callable with the same signature with the __call__ method.</span>
+<span class="linenos"> 41</span><span class="c1"># A batched logits processor&#39;s arguments for all requests in a batch are made available as lists.</span>
+<span class="linenos"> 42</span><span class="c1"># This helps user optimize the callback for large batch sizes. For example:</span>
+<span class="linenos"> 43</span><span class="c1"># 1. Process more work on host, e.g. running a JSON state machine, in parallel with model forward pass on device.</span>
+<span class="linenos"> 44</span><span class="c1"># 2. Coalesce H2D memory transfers for all requests into a single cudaMemcpyAsync call.</span>
+<span class="linenos"> 45</span><span class="c1"># 3. Launch a single batched kernel, e.g. for updating logits on device.</span>
+<span class="linenos"> 46</span><span class="k">class</span><span class="w"> </span><span class="nc">MyBatchedLogitsProcessor</span><span class="p">(</span><span class="n">BatchedLogitsProcessor</span><span class="p">):</span>
+<span class="linenos"> 47</span>
+<span class="linenos"> 48</span>    <span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">allowed_token_id</span><span class="p">:</span> <span class="nb">int</span><span class="p">):</span>
+<span class="linenos"> 49</span>        <span class="bp">self</span><span class="o">.</span><span class="n">allowed_token_id</span> <span class="o">=</span> <span class="n">allowed_token_id</span>
+<span class="linenos"> 50</span>
+<span class="linenos"> 51</span>    <span class="k">def</span><span class="w"> </span><span class="fm">__call__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">req_ids</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">],</span> <span class="n">logits</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">],</span>
+<span class="linenos"> 52</span>                 <span class="n">token_ids</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]]],</span> <span class="n">stream_ptr</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
+<span class="linenos"> 53</span>                 <span class="n">client_ids</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]]):</span>
+<span class="linenos"> 54</span>        <span class="c1"># Generate masks for all requests on host</span>
+<span class="linenos"> 55</span>        <span class="n">masks</span> <span class="o">=</span> <span class="p">[]</span>
+<span class="linenos"> 56</span>        <span class="k">for</span> <span class="n">req_id</span><span class="p">,</span> <span class="n">req_logits</span><span class="p">,</span> <span class="n">req_token_ids</span><span class="p">,</span> <span class="n">client_id</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span>
+<span class="linenos"> 57</span>                <span class="n">req_ids</span><span class="p">,</span> <span class="n">logits</span><span class="p">,</span> <span class="n">token_ids</span><span class="p">,</span> <span class="n">client_ids</span><span class="p">):</span>
+<span class="linenos"> 58</span>            <span class="n">mask</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">full_like</span><span class="p">(</span><span class="n">req_logits</span><span class="p">,</span>
+<span class="linenos"> 59</span>                                   <span class="n">fill_value</span><span class="o">=</span><span class="nb">float</span><span class="p">(</span><span class="s2">&quot;-inf&quot;</span><span class="p">),</span>
+<span class="linenos"> 60</span>                                   <span class="n">device</span><span class="o">=</span><span class="s2">&quot;cpu&quot;</span><span class="p">)</span>
+<span class="linenos"> 61</span>            <span class="n">mask</span><span class="p">[:,</span> <span class="p">:,</span> <span class="bp">self</span><span class="o">.</span><span class="n">allowed_token_id</span><span class="p">]</span> <span class="o">=</span> <span class="mi">0</span>
+<span class="linenos"> 62</span>            <span class="n">masks</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">mask</span><span class="p">)</span>
+<span class="linenos"> 63</span>
+<span class="linenos"> 64</span>        <span class="c1"># Move masks to device and add to logits using non-blocking operations</span>
+<span class="linenos"> 65</span>        <span class="k">with</span> <span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">stream</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">ExternalStream</span><span class="p">(</span><span class="n">stream_ptr</span><span class="p">)):</span>
+<span class="linenos"> 66</span>            <span class="k">for</span> <span class="n">req_logits</span><span class="p">,</span> <span class="n">mask</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">logits</span><span class="p">,</span> <span class="n">masks</span><span class="p">):</span>
+<span class="linenos"> 67</span>                <span class="n">req_logits</span> <span class="o">+=</span> <span class="n">mask</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">req_logits</span><span class="o">.</span><span class="n">device</span><span class="p">,</span> <span class="n">non_blocking</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
 <span class="linenos"> 68</span>
-<span class="linenos"> 69</span><span class="k">def</span><span class="w"> </span><span class="nf">main</span><span class="p">():</span>
-<span class="linenos"> 70</span>
-<span class="linenos"> 71</span>    <span class="c1"># Batched logits processor (only supported in TensorRT backend)</span>
-<span class="linenos"> 72</span>    <span class="c1"># should be specified when initializing LLM.</span>
-<span class="linenos"> 73</span>    <span class="n">llm</span> <span class="o">=</span> <span class="n">LLM</span><span class="p">(</span>
-<span class="linenos"> 74</span>        <span class="n">model</span><span class="o">=</span><span class="s2">&quot;TinyLlama/TinyLlama-1.1B-Chat-v1.0&quot;</span><span class="p">,</span>
-<span class="linenos"> 75</span>        <span class="n">batched_logits_processor</span><span class="o">=</span><span class="n">MyBatchedLogitsProcessor</span><span class="p">(</span><span class="n">allowed_token_id</span><span class="o">=</span><span class="mi">42</span><span class="p">))</span>
-<span class="linenos"> 76</span>
-<span class="linenos"> 77</span>    <span class="c1"># Sample prompts</span>
-<span class="linenos"> 78</span>    <span class="n">prompts</span> <span class="o">=</span> <span class="p">[</span>
-<span class="linenos"> 79</span>        <span class="s2">&quot;Hello, my name is&quot;</span><span class="p">,</span>
-<span class="linenos"> 80</span>        <span class="s2">&quot;The president of the United States is&quot;</span><span class="p">,</span>
-<span class="linenos"> 81</span>    <span class="p">]</span>
-<span class="linenos"> 82</span>
-<span class="linenos"> 83</span>    <span class="c1"># Generate text</span>
-<span class="linenos"> 84</span>    <span class="k">for</span> <span class="n">prompt_id</span><span class="p">,</span> <span class="n">prompt</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">prompts</span><span class="p">):</span>
-<span class="linenos"> 85</span>        <span class="c1"># Use non-batched logits processor callback only for odd-numbered prompts</span>
-<span class="linenos"> 86</span>        <span class="k">if</span> <span class="n">prompt_id</span> <span class="o">%</span> <span class="mi">2</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
-<span class="linenos"> 87</span>            <span class="n">sampling_params</span> <span class="o">=</span> <span class="n">SamplingParams</span><span class="p">(</span><span class="n">temperature</span><span class="o">=</span><span class="mf">0.8</span><span class="p">,</span> <span class="n">top_p</span><span class="o">=</span><span class="mf">0.95</span><span class="p">)</span>
-<span class="linenos"> 88</span>        <span class="k">else</span><span class="p">:</span>
-<span class="linenos"> 89</span>            <span class="c1"># Each prompt can be specified with a logits processor at runtime</span>
-<span class="linenos"> 90</span>            <span class="n">sampling_params</span> <span class="o">=</span> <span class="n">SamplingParams</span><span class="p">(</span>
-<span class="linenos"> 91</span>                <span class="n">temperature</span><span class="o">=</span><span class="mf">0.8</span><span class="p">,</span>
-<span class="linenos"> 92</span>                <span class="n">top_p</span><span class="o">=</span><span class="mf">0.95</span><span class="p">,</span>
-<span class="linenos"> 93</span>                <span class="n">logits_processor</span><span class="o">=</span><span class="n">MyLogitsProcessor</span><span class="p">(</span><span class="n">allowed_token_id</span><span class="o">=</span><span class="mi">42</span><span class="p">))</span>
-<span class="linenos"> 94</span>
-<span class="linenos"> 95</span>        <span class="k">for</span> <span class="n">output</span> <span class="ow">in</span> <span class="n">llm</span><span class="o">.</span><span class="n">generate</span><span class="p">([</span><span class="n">prompt</span><span class="p">],</span> <span class="n">sampling_params</span><span class="p">):</span>
-<span class="linenos"> 96</span>            <span class="nb">print</span><span class="p">(</span>
-<span class="linenos"> 97</span>                <span class="sa">f</span><span class="s2">&quot;Prompt: </span><span class="si">{</span><span class="n">output</span><span class="o">.</span><span class="n">prompt</span><span class="si">!r}</span><span class="s2">, Generated text: </span><span class="si">{</span><span class="n">output</span><span class="o">.</span><span class="n">outputs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">text</span><span class="si">!r}</span><span class="s2">&quot;</span>
-<span class="linenos"> 98</span>            <span class="p">)</span>
-<span class="linenos"> 99</span>
-<span class="linenos">100</span>    <span class="c1"># Got output like</span>
-<span class="linenos">101</span>    <span class="c1"># Prompt: &#39;Hello, my name is&#39;, Generated text: &#39;\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming&#39;</span>
-<span class="linenos">102</span>    <span class="c1"># Prompt: &#39;The president of the United States is&#39;, Generated text: &quot;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&quot;</span>
-<span class="linenos">103</span>
-<span class="linenos">104</span>    <span class="c1"># Use batched processor with batch size = 2</span>
-<span class="linenos">105</span>    <span class="n">sampling_params</span> <span class="o">=</span> <span class="n">SamplingParams</span><span class="p">(</span><span class="n">apply_batched_logits_processor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
-<span class="linenos">106</span>    <span class="k">for</span> <span class="n">output</span> <span class="ow">in</span> <span class="n">llm</span><span class="o">.</span><span class="n">generate</span><span class="p">(</span><span class="n">prompts</span><span class="p">,</span> <span class="n">sampling_params</span><span class="p">):</span>
-<span class="linenos">107</span>        <span class="nb">print</span><span class="p">(</span>
-<span class="linenos">108</span>            <span class="sa">f</span><span class="s2">&quot;Prompt: </span><span class="si">{</span><span class="n">output</span><span class="o">.</span><span class="n">prompt</span><span class="si">!r}</span><span class="s2">, Generated text: </span><span class="si">{</span><span class="n">output</span><span class="o">.</span><span class="n">outputs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">text</span><span class="si">!r}</span><span class="s2">&quot;</span>
-<span class="linenos">109</span>        <span class="p">)</span>
-<span class="linenos">110</span>
-<span class="linenos">111</span>    <span class="c1"># Got output like</span>
-<span class="linenos">112</span>    <span class="c1"># Prompt: &#39;Hello, my name is&#39;, Generated text: &quot;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&quot;</span>
-<span class="linenos">113</span>    <span class="c1"># Prompt: &#39;The president of the United States is&#39;, Generated text: &quot;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&quot;</span>
-<span class="linenos">114</span>
+<span class="linenos"> 69</span>
+<span class="linenos"> 70</span><span class="k">def</span><span class="w"> </span><span class="nf">main</span><span class="p">():</span>
+<span class="linenos"> 71</span>
+<span class="linenos"> 72</span>    <span class="c1"># Batched logits processor (only supported in TensorRT backend)</span>
+<span class="linenos"> 73</span>    <span class="c1"># should be specified when initializing LLM.</span>
+<span class="linenos"> 74</span>    <span class="n">llm</span> <span class="o">=</span> <span class="n">LLM</span><span class="p">(</span>
+<span class="linenos"> 75</span>        <span class="n">model</span><span class="o">=</span><span class="s2">&quot;TinyLlama/TinyLlama-1.1B-Chat-v1.0&quot;</span><span class="p">,</span>
+<span class="linenos"> 76</span>        <span class="n">batched_logits_processor</span><span class="o">=</span><span class="n">MyBatchedLogitsProcessor</span><span class="p">(</span><span class="n">allowed_token_id</span><span class="o">=</span><span class="mi">42</span><span class="p">))</span>
+<span class="linenos"> 77</span>
+<span class="linenos"> 78</span>    <span class="c1"># Sample prompts</span>
+<span class="linenos"> 79</span>    <span class="n">prompts</span> <span class="o">=</span> <span class="p">[</span>
+<span class="linenos"> 80</span>        <span class="s2">&quot;Hello, my name is&quot;</span><span class="p">,</span>
+<span class="linenos"> 81</span>        <span class="s2">&quot;The president of the United States is&quot;</span><span class="p">,</span>
+<span class="linenos"> 82</span>    <span class="p">]</span>
+<span class="linenos"> 83</span>
+<span class="linenos"> 84</span>    <span class="c1"># Generate text</span>
+<span class="linenos"> 85</span>    <span class="k">for</span> <span class="n">prompt_id</span><span class="p">,</span> <span class="n">prompt</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">prompts</span><span class="p">):</span>
+<span class="linenos"> 86</span>        <span class="c1"># Use non-batched logits processor callback only for odd-numbered prompts</span>
+<span class="linenos"> 87</span>        <span class="k">if</span> <span class="n">prompt_id</span> <span class="o">%</span> <span class="mi">2</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
+<span class="linenos"> 88</span>            <span class="n">sampling_params</span> <span class="o">=</span> <span class="n">SamplingParams</span><span class="p">(</span><span class="n">temperature</span><span class="o">=</span><span class="mf">0.8</span><span class="p">,</span> <span class="n">top_p</span><span class="o">=</span><span class="mf">0.95</span><span class="p">)</span>
+<span class="linenos"> 89</span>        <span class="k">else</span><span class="p">:</span>
+<span class="linenos"> 90</span>            <span class="c1"># Each prompt can be specified with a logits processor at runtime</span>
+<span class="linenos"> 91</span>            <span class="n">sampling_params</span> <span class="o">=</span> <span class="n">SamplingParams</span><span class="p">(</span>
+<span class="linenos"> 92</span>                <span class="n">temperature</span><span class="o">=</span><span class="mf">0.8</span><span class="p">,</span>
+<span class="linenos"> 93</span>                <span class="n">top_p</span><span class="o">=</span><span class="mf">0.95</span><span class="p">,</span>
+<span class="linenos"> 94</span>                <span class="n">logits_processor</span><span class="o">=</span><span class="n">MyLogitsProcessor</span><span class="p">(</span><span class="n">allowed_token_id</span><span class="o">=</span><span class="mi">42</span><span class="p">))</span>
+<span class="linenos"> 95</span>
+<span class="linenos"> 96</span>        <span class="k">for</span> <span class="n">output</span> <span class="ow">in</span> <span class="n">llm</span><span class="o">.</span><span class="n">generate</span><span class="p">([</span><span class="n">prompt</span><span class="p">],</span> <span class="n">sampling_params</span><span class="p">):</span>
+<span class="linenos"> 97</span>            <span class="nb">print</span><span class="p">(</span>
+<span class="linenos"> 98</span>                <span class="sa">f</span><span class="s2">&quot;Prompt: </span><span class="si">{</span><span class="n">output</span><span class="o">.</span><span class="n">prompt</span><span class="si">!r}</span><span class="s2">, Generated text: </span><span class="si">{</span><span class="n">output</span><span class="o">.</span><span class="n">outputs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">text</span><span class="si">!r}</span><span class="s2">&quot;</span>
+<span class="linenos"> 99</span>            <span class="p">)</span>
+<span class="linenos">100</span>
+<span class="linenos">101</span>    <span class="c1"># Got output like</span>
+<span class="linenos">102</span>    <span class="c1"># Prompt: &#39;Hello, my name is&#39;, Generated text: &#39;\n\nJane Smith. I am a student pursuing my degree in Computer Science at [university]. I enjoy learning new things, especially technology and programming&#39;</span>
+<span class="linenos">103</span>    <span class="c1"># Prompt: &#39;The president of the United States is&#39;, Generated text: &quot;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&quot;</span>
+<span class="linenos">104</span>
+<span class="linenos">105</span>    <span class="c1"># Use batched processor with batch size = 2</span>
+<span class="linenos">106</span>    <span class="n">sampling_params</span> <span class="o">=</span> <span class="n">SamplingParams</span><span class="p">(</span><span class="n">apply_batched_logits_processor</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+<span class="linenos">107</span>    <span class="k">for</span> <span class="n">output</span> <span class="ow">in</span> <span class="n">llm</span><span class="o">.</span><span class="n">generate</span><span class="p">(</span><span class="n">prompts</span><span class="p">,</span> <span class="n">sampling_params</span><span class="p">):</span>
+<span class="linenos">108</span>        <span class="nb">print</span><span class="p">(</span>
+<span class="linenos">109</span>            <span class="sa">f</span><span class="s2">&quot;Prompt: </span><span class="si">{</span><span class="n">output</span><span class="o">.</span><span class="n">prompt</span><span class="si">!r}</span><span class="s2">, Generated text: </span><span class="si">{</span><span class="n">output</span><span class="o">.</span><span class="n">outputs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">text</span><span class="si">!r}</span><span class="s2">&quot;</span>
+<span class="linenos">110</span>        <span class="p">)</span>
+<span class="linenos">111</span>
+<span class="linenos">112</span>    <span class="c1"># Got output like</span>
+<span class="linenos">113</span>    <span class="c1"># Prompt: &#39;Hello, my name is&#39;, Generated text: &quot;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&quot;</span>
+<span class="linenos">114</span>    <span class="c1"># Prompt: &#39;The president of the United States is&#39;, Generated text: &quot;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&#39;&quot;</span>
 <span class="linenos">115</span>
-<span class="linenos">116</span><span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s1">&#39;__main__&#39;</span><span class="p">:</span>
-<span class="linenos">117</span>    <span class="n">main</span><span class="p">()</span>
+<span class="linenos">116</span>
+<span class="linenos">117</span><span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s1">&#39;__main__&#39;</span><span class="p">:</span>
+<span class="linenos">118</span>    <span class="n">main</span><span class="p">()</span>
 </pre></div>
 </div>
 </section>
@@ -649,11 +654,11 @@
       </div>
     </a>
     <a class="right-next"
-       href="llm_inference_kv_events.html"
+       href="llm_eagle2_decoding.html"
        title="next page">
       <div class="prev-next-info">
         <p class="prev-next-subtitle">next</p>
-        <p class="prev-next-title">Get KV Cache Events</p>
+        <p class="prev-next-title">Generate Text Using Eagle2 Decoding</p>
       </div>
       <i class="fa-solid fa-angle-right"></i>
     </a>
@@ -754,6 +759,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/llm_lookahead_decoding.html b/latest/examples/llm_lookahead_decoding.html
index 296fb15b35..50c27b1e81 100644
--- a/latest/examples/llm_lookahead_decoding.html
+++ b/latest/examples/llm_lookahead_decoding.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2 current active"><a class="current reference internal" href="#">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2 current active"><a class="current reference internal" href="#">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -675,6 +679,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/llm_medusa_decoding.html b/latest/examples/llm_medusa_decoding.html
index b38f8a6513..72f63e2800 100644
--- a/latest/examples/llm_medusa_decoding.html
+++ b/latest/examples/llm_medusa_decoding.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -731,6 +735,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/llm_mgmn_llm_distributed.html b/latest/examples/llm_mgmn_llm_distributed.html
index d8f464d971..ca1735624e 100644
--- a/latest/examples/llm_mgmn_llm_distributed.html
+++ b/latest/examples/llm_mgmn_llm_distributed.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -690,6 +694,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/llm_mgmn_trtllm_bench.html b/latest/examples/llm_mgmn_trtllm_bench.html
index 0a3c070501..c6f424b678 100644
--- a/latest/examples/llm_mgmn_trtllm_bench.html
+++ b/latest/examples/llm_mgmn_trtllm_bench.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -584,25 +588,24 @@
 <span class="linenos">74</span>
 <span class="linenos">75</span><span class="s2">        # This is optional</span>
 <span class="linenos">76</span><span class="s2">        cat &gt; /tmp/pytorch_extra_args.txt &lt;&lt; EOF</span>
-<span class="linenos">77</span><span class="s2">pytorch_backend_config:</span>
-<span class="linenos">78</span><span class="s2">    use_cuda_graph: false</span>
-<span class="linenos">79</span><span class="s2">    cuda_graph_padding_enabled: false</span>
-<span class="linenos">80</span><span class="s2">    print_iter_log: true</span>
-<span class="linenos">81</span><span class="s2">enable_attention_dp: false</span>
-<span class="linenos">82</span><span class="s2">EOF</span>
-<span class="linenos">83</span>
-<span class="linenos">84</span><span class="s2">        # launch the benchmark</span>
-<span class="linenos">85</span><span class="s2">        trtllm-llmapi-launch \</span>
-<span class="linenos">86</span><span class="s2">         trtllm-bench \</span>
-<span class="linenos">87</span><span class="s2">            --model </span><span class="nv">$MODEL_NAME</span><span class="s2"> \</span>
-<span class="linenos">88</span><span class="s2">            --model_path </span><span class="nv">$LOCAL_MODEL</span><span class="s2"> \</span>
-<span class="linenos">89</span><span class="s2">            throughput \</span>
-<span class="linenos">90</span><span class="s2">            --dataset </span><span class="nv">$data_path</span><span class="s2"> \</span>
-<span class="linenos">91</span><span class="s2">            --backend pytorch \</span>
-<span class="linenos">92</span><span class="s2">            --tp 16 \</span>
-<span class="linenos">93</span><span class="s2">            --extra_llm_api_options /tmp/pytorch_extra_args.txt \</span>
-<span class="linenos">94</span><span class="s2">            </span><span class="nv">$EXTRA_ARGS</span>
-<span class="linenos">95</span><span class="s2">    &quot;</span>
+<span class="linenos">77</span><span class="s2">use_cuda_graph: false</span>
+<span class="linenos">78</span><span class="s2">cuda_graph_padding_enabled: false</span>
+<span class="linenos">79</span><span class="s2">print_iter_log: true</span>
+<span class="linenos">80</span><span class="s2">enable_attention_dp: false</span>
+<span class="linenos">81</span><span class="s2">EOF</span>
+<span class="linenos">82</span>
+<span class="linenos">83</span><span class="s2">        # launch the benchmark</span>
+<span class="linenos">84</span><span class="s2">        trtllm-llmapi-launch \</span>
+<span class="linenos">85</span><span class="s2">         trtllm-bench \</span>
+<span class="linenos">86</span><span class="s2">            --model </span><span class="nv">$MODEL_NAME</span><span class="s2"> \</span>
+<span class="linenos">87</span><span class="s2">            --model_path </span><span class="nv">$LOCAL_MODEL</span><span class="s2"> \</span>
+<span class="linenos">88</span><span class="s2">            throughput \</span>
+<span class="linenos">89</span><span class="s2">            --dataset </span><span class="nv">$data_path</span><span class="s2"> \</span>
+<span class="linenos">90</span><span class="s2">            --backend pytorch \</span>
+<span class="linenos">91</span><span class="s2">            --tp 16 \</span>
+<span class="linenos">92</span><span class="s2">            --extra_llm_api_options /tmp/pytorch_extra_args.txt \</span>
+<span class="linenos">93</span><span class="s2">            </span><span class="nv">$EXTRA_ARGS</span>
+<span class="linenos">94</span><span class="s2">    &quot;</span>
 </pre></div>
 </div>
 </section>
@@ -732,6 +735,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/llm_mgmn_trtllm_serve.html b/latest/examples/llm_mgmn_trtllm_serve.html
index 6699772866..b5915e1fad 100644
--- a/latest/examples/llm_mgmn_trtllm_serve.html
+++ b/latest/examples/llm_mgmn_trtllm_serve.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -692,6 +696,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/llm_multilora.html b/latest/examples/llm_multilora.html
index f667432cdd..0759100ce6 100644
--- a/latest/examples/llm_multilora.html
+++ b/latest/examples/llm_multilora.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -696,6 +700,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/llm_quantization.html b/latest/examples/llm_quantization.html
index 0b8f09ac85..ead01073f4 100644
--- a/latest/examples/llm_quantization.html
+++ b/latest/examples/llm_quantization.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2 current active"><a class="current reference internal" href="#">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2 current active"><a class="current reference internal" href="#">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -718,6 +722,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/openai_chat_client.html b/latest/examples/openai_chat_client.html
index 322a6e551c..24cd0bedbb 100644
--- a/latest/examples/openai_chat_client.html
+++ b/latest/examples/openai_chat_client.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -659,6 +663,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/openai_chat_client_for_multimodal.html b/latest/examples/openai_chat_client_for_multimodal.html
index 11824e4584..4d8104a432 100644
--- a/latest/examples/openai_chat_client_for_multimodal.html
+++ b/latest/examples/openai_chat_client_for_multimodal.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -752,6 +756,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/openai_completion_client.html b/latest/examples/openai_completion_client.html
index b8ed793432..8906617c30 100644
--- a/latest/examples/openai_completion_client.html
+++ b/latest/examples/openai_completion_client.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -653,6 +657,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/examples/trtllm_serve_examples.html b/latest/examples/trtllm_serve_examples.html
index 032e14d90a..e25b6bc98f 100644
--- a/latest/examples/trtllm_serve_examples.html
+++ b/latest/examples/trtllm_serve_examples.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -645,6 +649,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/genindex.html b/latest/genindex.html
index baa30c67e3..fb5b2ecb5d 100644
--- a/latest/genindex.html
+++ b/latest/genindex.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -60,7 +60,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -329,6 +329,7 @@
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -350,6 +351,7 @@
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -414,6 +416,7 @@
 <li class="toctree-l1"><a class="reference internal" href="advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -448,6 +451,7 @@
 <li class="toctree-l1"><a class="reference internal" href="blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -775,14 +779,14 @@
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.AllReduceParams">AllReduceParams (class in tensorrt_llm.functional)</a>
 </li>
-  </ul></td>
-  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.AllReduceStrategy">AllReduceStrategy (class in tensorrt_llm.functional)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.SamplingParams.apply_batched_logits_processor">apply_batched_logits_processor (tensorrt_llm.llmapi.SamplingParams attribute)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.RopeEmbeddingUtils.apply_llama3_scaling">apply_llama3_scaling() (tensorrt_llm.functional.RopeEmbeddingUtils static method)</a>
 </li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.RopeEmbeddingUtils.apply_rotary_pos_emb">apply_rotary_pos_emb() (tensorrt_llm.functional.RopeEmbeddingUtils static method)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.RopeEmbeddingUtils.apply_rotary_pos_emb_chatglm">apply_rotary_pos_emb_chatglm() (tensorrt_llm.functional.RopeEmbeddingUtils static method)</a>
@@ -804,14 +808,28 @@
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.AttentionMaskType">AttentionMaskType (class in tensorrt_llm.functional)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.layers.html#tensorrt_llm.layers.attention.AttentionParams">AttentionParams (class in tensorrt_llm.layers.attention)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.attn_backend">attn_backend (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.SD3Transformer2DModel.attn_processors">attn_processors (tensorrt_llm.models.SD3Transformer2DModel property)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.MultimodalModelRunner.audio_engine_dir">audio_engine_dir (tensorrt_llm.runtime.MultimodalModelRunner property)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.AllReduceStrategy.AUTO">AUTO (tensorrt_llm.functional.AllReduceStrategy attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.auto_deploy_config">auto_deploy_config (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.auto_parallel">auto_parallel (tensorrt_llm.llmapi.TrtLlmArgs attribute)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.BuildConfig.auto_parallel_config">auto_parallel_config (tensorrt_llm.llmapi.BuildConfig attribute)</a>
+
+      <ul>
+        <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.auto_parallel_config">(tensorrt_llm.llmapi.TrtLlmArgs property)</a>
+</li>
+      </ul></li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.auto_parallel_world_size">auto_parallel_world_size (tensorrt_llm.llmapi.TrtLlmArgs attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.autotuner_enabled">autotuner_enabled (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.avg_pool2d">avg_pool2d() (in module tensorrt_llm.functional)</a>
 </li>
@@ -851,10 +869,10 @@
 </li>
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.BertForQuestionAnswering">BertForQuestionAnswering (class in tensorrt_llm.models)</a>
 </li>
-  </ul></td>
-  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.BertForSequenceClassification">BertForSequenceClassification (class in tensorrt_llm.models)</a>
 </li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.BertModel">BertModel (class in tensorrt_llm.models)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.SamplingParams.best_of">best_of (tensorrt_llm.llmapi.SamplingParams attribute)</a>
@@ -875,6 +893,12 @@
 </li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.GenerationSession.buffer_allocated">buffer_allocated (tensorrt_llm.runtime.GenerationSession attribute)</a>
 </li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.build_config">build_config (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+
+      <ul>
+        <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.build_config">(tensorrt_llm.llmapi.TrtLlmArgs attribute)</a>
+</li>
+      </ul></li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.BuildCacheConfig">BuildCacheConfig (class in tensorrt_llm.llmapi)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.BuildConfig">BuildConfig (class in tensorrt_llm.llmapi)</a>
@@ -898,6 +922,8 @@
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.CalibConfig.calib_batch_size">calib_batch_size (tensorrt_llm.llmapi.CalibConfig attribute)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.CalibConfig.calib_batches">calib_batches (tensorrt_llm.llmapi.CalibConfig attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.calib_config">calib_config (tensorrt_llm.llmapi.TrtLlmArgs attribute)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.CalibConfig.calib_dataset">calib_dataset (tensorrt_llm.llmapi.CalibConfig attribute)</a>
 </li>
@@ -1070,6 +1096,8 @@
 </li>
       </ul></li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.conv_transpose2d">conv_transpose2d() (in module tensorrt_llm.functional)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.convert_load_format">convert_load_format() (tensorrt_llm.llmapi.TorchLlmArgs class method)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.layers.html#tensorrt_llm.layers.conv.ConvTranspose2d">ConvTranspose2d (class in tensorrt_llm.layers.conv)</a>
 </li>
@@ -1112,8 +1140,12 @@
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.KvCacheConfig.cross_kv_cache_fraction">cross_kv_cache_fraction (tensorrt_llm.llmapi.KvCacheConfig attribute)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.DisaggregatedParams.ctx_request_id">ctx_request_id (tensorrt_llm.llmapi.DisaggregatedParams attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.cuda_graph_batch_sizes">cuda_graph_batch_sizes (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.cuda_graph_cache_size">cuda_graph_cache_size (tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.cuda_graph_max_batch_size">cuda_graph_max_batch_size (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.cuda_graph_mode">cuda_graph_mode (tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig attribute)</a>
 
@@ -1121,6 +1153,8 @@
         <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.GenerationSession.cuda_graph_mode">(tensorrt_llm.runtime.GenerationSession attribute)</a>
 </li>
       </ul></li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.cuda_graph_padding_enabled">cuda_graph_padding_enabled (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+</li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.GenerationSession.cuda_stream_guard">cuda_stream_guard() (tensorrt_llm.runtime.GenerationSession method)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.cuda_stream_sync">cuda_stream_sync() (in module tensorrt_llm.functional)</a>
@@ -1163,6 +1197,12 @@
 </li>
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.DecoderModel">DecoderModel (class in tensorrt_llm.models)</a>
 </li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.decoding_config">decoding_config (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+
+      <ul>
+        <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.decoding_config">(tensorrt_llm.llmapi.TrtLlmArgs attribute)</a>
+</li>
+      </ul></li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.EagleDecodingConfig.decoding_type">decoding_type (tensorrt_llm.llmapi.EagleDecodingConfig attribute)</a>
 
       <ul>
@@ -1171,6 +1211,8 @@
         <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.MedusaDecodingConfig.decoding_type">(tensorrt_llm.llmapi.MedusaDecodingConfig attribute)</a>
 </li>
         <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.MTPDecodingConfig.decoding_type">(tensorrt_llm.llmapi.MTPDecodingConfig attribute)</a>
+</li>
+        <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.NGramDecodingConfig.decoding_type">(tensorrt_llm.llmapi.NGramDecodingConfig attribute)</a>
 </li>
       </ul></li>
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.DeepseekForCausalLM">DeepseekForCausalLM (class in tensorrt_llm.models)</a>
@@ -1200,10 +1242,14 @@
       <li><a href="python-api/tensorrt_llm.layers.html#tensorrt_llm.layers.attention.DiffusersAttention">DiffusersAttention (class in tensorrt_llm.layers.attention)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.DimRange">DimRange (class in tensorrt_llm.functional)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.KvCacheRetentionConfig.directory">directory (tensorrt_llm.llmapi.KvCacheRetentionConfig property)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.SideStreamIDType.disable">disable (tensorrt_llm.functional.SideStreamIDType attribute)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.SD3Transformer2DModel.disable_forward_chunking">disable_forward_chunking() (tensorrt_llm.models.SD3Transformer2DModel method)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.disable_overlap_scheduler">disable_overlap_scheduler (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.CompletionOutput.disaggregated_params">disaggregated_params (tensorrt_llm.llmapi.CompletionOutput attribute)</a>
 </li>
@@ -1256,6 +1302,8 @@
 <table style="width: 100%" class="indextable genindextable"><tr>
   <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.SpeculativeDecodingMode.EAGLE">EAGLE (tensorrt_llm.models.SpeculativeDecodingMode attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.EagleDecodingConfig.eagle3_one_model">eagle3_one_model (tensorrt_llm.llmapi.EagleDecodingConfig attribute)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.EagleDecodingConfig.eagle_choices">eagle_choices (tensorrt_llm.llmapi.EagleDecodingConfig attribute)</a>
 </li>
@@ -1280,10 +1328,14 @@
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.embedding">embedding() (in module tensorrt_llm.functional)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.SamplingParams.embedding_bias">embedding_bias (tensorrt_llm.llmapi.SamplingParams attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.embedding_parallel_mode">embedding_parallel_mode (tensorrt_llm.llmapi.TrtLlmArgs attribute)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.DynamicBatchConfig.enable_batch_size_tuning">enable_batch_size_tuning (tensorrt_llm.llmapi.DynamicBatchConfig attribute)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.KvCacheConfig.enable_block_reuse">enable_block_reuse (tensorrt_llm.llmapi.KvCacheConfig attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.enable_build_cache">enable_build_cache (tensorrt_llm.llmapi.TrtLlmArgs attribute)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.enable_context_fmha_fp32_acc">enable_context_fmha_fp32_acc (tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig attribute)</a>
 </li>
@@ -1291,12 +1343,24 @@
 </li>
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.SD3Transformer2DModel.enable_forward_chunking">enable_forward_chunking() (tensorrt_llm.models.SD3Transformer2DModel method)</a>
 </li>
-      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.DynamicBatchConfig.enable_max_num_tokens_tuning">enable_max_num_tokens_tuning (tensorrt_llm.llmapi.DynamicBatchConfig attribute)</a>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.enable_iter_perf_stats">enable_iter_perf_stats (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
 </li>
-      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.KvCacheConfig.enable_partial_reuse">enable_partial_reuse (tensorrt_llm.llmapi.KvCacheConfig attribute)</a>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.enable_iter_req_stats">enable_iter_req_stats (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.enable_layerwise_nvtx_marker">enable_layerwise_nvtx_marker (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.DynamicBatchConfig.enable_max_num_tokens_tuning">enable_max_num_tokens_tuning (tensorrt_llm.llmapi.DynamicBatchConfig attribute)</a>
 </li>
   </ul></td>
   <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.enable_min_latency">enable_min_latency (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.KvCacheConfig.enable_partial_reuse">enable_partial_reuse (tensorrt_llm.llmapi.KvCacheConfig attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.enable_tqdm">enable_tqdm (tensorrt_llm.llmapi.TrtLlmArgs attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.enable_trtllm_sampler">enable_trtllm_sampler (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+</li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.EncDecModelRunner">EncDecModelRunner (class in tensorrt_llm.runtime)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.EncDecModelRunner.encoder_run">encoder_run() (tensorrt_llm.runtime.EncDecModelRunner method)</a>
@@ -1334,8 +1398,12 @@
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.expand_mask">expand_mask() (in module tensorrt_llm.functional)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.SpeculativeDecodingMode.EXPLICIT_DRAFT_TOKENS">EXPLICIT_DRAFT_TOKENS (tensorrt_llm.models.SpeculativeDecodingMode attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.extended_runtime_perf_knob_config">extended_runtime_perf_knob_config (tensorrt_llm.llmapi.TrtLlmArgs attribute)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig">ExtendedRuntimePerfKnobConfig (class in tensorrt_llm.llmapi)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.extra_resource_managers">extra_resource_managers (tensorrt_llm.llmapi.TorchLlmArgs property)</a>
 </li>
   </ul></td>
 </tr></table>
@@ -1348,6 +1416,8 @@
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.FalconForCausalLM">FalconForCausalLM (class in tensorrt_llm.models)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.FalconModel">FalconModel (class in tensorrt_llm.models)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.fast_build">fast_build (tensorrt_llm.llmapi.TrtLlmArgs attribute)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.layers.html#tensorrt_llm.layers.mlp.FusedGatedMLP.fc_gate">fc_gate() (tensorrt_llm.layers.mlp.FusedGatedMLP method)</a>
 </li>
@@ -1357,6 +1427,12 @@
 </li>
       <li><a href="python-api/tensorrt_llm.layers.html#tensorrt_llm.layers.mlp.FusedGatedMLP.fc_gate_plugin">fc_gate_plugin() (tensorrt_llm.layers.mlp.FusedGatedMLP method)</a>
 </li>
+      <li><a href="llm-api/reference.html#id12">field_name (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>, <a href="llm-api/reference.html#id15">[1]</a>, <a href="llm-api/reference.html#id18">[2]</a>, <a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.field_name">[3]</a>
+
+      <ul>
+        <li><a href="llm-api/reference.html#id21">(tensorrt_llm.llmapi.TrtLlmArgs attribute)</a>, <a href="llm-api/reference.html#id24">[1]</a>, <a href="llm-api/reference.html#id27">[2]</a>, <a href="llm-api/reference.html#id30">[3]</a>, <a href="llm-api/reference.html#id33">[4]</a>, <a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.field_name">[5]</a>
+</li>
+      </ul></li>
       <li><a href="python-api/tensorrt_llm.layers.html#tensorrt_llm.layers.attention.AttentionParams.fill_attention_const_params_for_long_rope">fill_attention_const_params_for_long_rope() (tensorrt_llm.layers.attention.AttentionParams method)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.layers.html#tensorrt_llm.layers.attention.AttentionParams.fill_attention_const_params_for_rope">fill_attention_const_params_for_rope() (tensorrt_llm.layers.attention.AttentionParams method)</a>
@@ -1571,6 +1647,8 @@
         <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.MedusaDecodingConfig.from_dict">(tensorrt_llm.llmapi.MedusaDecodingConfig class method)</a>
 </li>
         <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.MTPDecodingConfig.from_dict">(tensorrt_llm.llmapi.MTPDecodingConfig class method)</a>
+</li>
+        <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.NGramDecodingConfig.from_dict">(tensorrt_llm.llmapi.NGramDecodingConfig class method)</a>
 </li>
         <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.QuantConfig.from_dict">(tensorrt_llm.llmapi.QuantConfig class method)</a>
 </li>
@@ -1814,6 +1892,8 @@
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.GenerationSession.get_num_heads_kv">get_num_heads_kv() (tensorrt_llm.runtime.GenerationSession method)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.Tensor.get_parent">get_parent() (tensorrt_llm.functional.Tensor method)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.get_pytorch_backend_config">get_pytorch_backend_config() (tensorrt_llm.llmapi.TorchLlmArgs method)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.DisaggregatedParams.get_request_type">get_request_type() (tensorrt_llm.llmapi.DisaggregatedParams method)</a>
 </li>
@@ -1963,12 +2043,12 @@
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.int_clip">int_clip() (in module tensorrt_llm.functional)</a>
 </li>
-  </ul></td>
-  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.interpolate">interpolate() (in module tensorrt_llm.functional)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.PositionEmbeddingType.is_alibi">is_alibi() (tensorrt_llm.functional.PositionEmbeddingType method)</a>
 </li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.PositionEmbeddingType.is_deferred">is_deferred() (tensorrt_llm.functional.PositionEmbeddingType method)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.Tensor.is_dynamic">is_dynamic() (tensorrt_llm.functional.Tensor method)</a>
@@ -1978,18 +2058,24 @@
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.GemmaConfig.is_gemma_2">is_gemma_2 (tensorrt_llm.models.GemmaConfig property)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.GemmaConfig.is_gemma_3">is_gemma_3 (tensorrt_llm.models.GemmaConfig property)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.NGramDecodingConfig.is_keep_all">is_keep_all (tensorrt_llm.llmapi.NGramDecodingConfig attribute)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.GenerationSession.is_medusa_mode">is_medusa_mode (tensorrt_llm.runtime.GenerationSession property)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.QuantConfig.is_module_excluded_from_quantization">is_module_excluded_from_quantization() (tensorrt_llm.llmapi.QuantConfig method)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.PositionEmbeddingType.is_mrope">is_mrope() (tensorrt_llm.functional.PositionEmbeddingType method)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.NGramDecodingConfig.is_public_pool">is_public_pool (tensorrt_llm.llmapi.NGramDecodingConfig attribute)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.GenerationSession.is_redrafter_mode">is_redrafter_mode (tensorrt_llm.runtime.GenerationSession property)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.PositionEmbeddingType.is_rope">is_rope() (tensorrt_llm.functional.PositionEmbeddingType method)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.Tensor.is_trt_wrapper">is_trt_wrapper() (tensorrt_llm.functional.Tensor method)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.NGramDecodingConfig.is_use_oldest">is_use_oldest (tensorrt_llm.llmapi.NGramDecodingConfig attribute)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.layers.html#tensorrt_llm.layers.attention.AttentionParams.is_valid">is_valid() (tensorrt_llm.layers.attention.AttentionParams method)</a>
 
@@ -2020,6 +2106,8 @@
 <table style="width: 100%" class="indextable genindextable"><tr>
   <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="python-api/tensorrt_llm.layers.html#tensorrt_llm.layers.attention.KeyValueCacheParams">KeyValueCacheParams (class in tensorrt_llm.layers.attention)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.kv_cache_dtype">kv_cache_dtype (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.QuantConfig.kv_cache_quant_algo">kv_cache_quant_algo (tensorrt_llm.llmapi.QuantConfig attribute)</a>
 </li>
@@ -2115,11 +2203,13 @@
 </li>
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.LlavaNextVisionWrapper">LlavaNextVisionWrapper (class in tensorrt_llm.models)</a>
 </li>
-  </ul></td>
-  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.LLM">LLM (class in tensorrt_llm.llmapi)</a>
 </li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.MultimodalModelRunner.llm_engine_dir">llm_engine_dir (tensorrt_llm.runtime.MultimodalModelRunner property)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.LlmArgs">LlmArgs (in module tensorrt_llm.llmapi)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.PretrainedModel.load">load() (tensorrt_llm.models.PretrainedModel method)</a>
 
@@ -2127,6 +2217,8 @@
         <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.SD3Transformer2DModel.load">(tensorrt_llm.models.SD3Transformer2DModel method)</a>
 </li>
       </ul></li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.load_format">load_format (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+</li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.MultimodalModelRunner.load_test_audio">load_test_audio() (tensorrt_llm.runtime.MultimodalModelRunner method)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.MultimodalModelRunner.load_test_data">load_test_data() (tensorrt_llm.runtime.MultimodalModelRunner method)</a>
@@ -2182,6 +2274,8 @@
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.low_latency_gemm">low_latency_gemm() (in module tensorrt_llm.functional)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.low_latency_gemm_swiglu">low_latency_gemm_swiglu() (in module tensorrt_llm.functional)</a>
+</li>
+      <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.AllReduceStrategy.LOWPRECISION">LOWPRECISION (tensorrt_llm.functional.AllReduceStrategy attribute)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.lt">lt() (in module tensorrt_llm.functional)</a>
 </li>
@@ -2239,6 +2333,12 @@
 
       <ul>
         <li><a href="llm-api/reference.html#id8">(tensorrt_llm.llmapi.BuildCacheConfig property)</a>
+</li>
+      </ul></li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.max_cpu_loras">max_cpu_loras (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+
+      <ul>
+        <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.max_cpu_loras">(tensorrt_llm.llmapi.TrtLlmArgs attribute)</a>
 </li>
       </ul></li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.BuildConfig.max_draft_len">max_draft_len (tensorrt_llm.llmapi.BuildConfig attribute)</a>
@@ -2248,6 +2348,20 @@
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.BuildConfig.max_encoder_input_len">max_encoder_input_len (tensorrt_llm.llmapi.BuildConfig attribute)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.BuildConfig.max_input_len">max_input_len (tensorrt_llm.llmapi.BuildConfig attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.max_lora_rank">max_lora_rank (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+
+      <ul>
+        <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.max_lora_rank">(tensorrt_llm.llmapi.TrtLlmArgs attribute)</a>
+</li>
+      </ul></li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.max_loras">max_loras (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+
+      <ul>
+        <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.max_loras">(tensorrt_llm.llmapi.TrtLlmArgs attribute)</a>
+</li>
+      </ul></li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.NGramDecodingConfig.max_matching_ngram_size">max_matching_ngram_size (tensorrt_llm.llmapi.NGramDecodingConfig attribute)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.ModelConfig.max_medusa_tokens">max_medusa_tokens (tensorrt_llm.runtime.ModelConfig attribute)</a>
 </li>
@@ -2335,12 +2449,12 @@
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.min">min() (in module tensorrt_llm.functional)</a>
 </li>
-  </ul></td>
-  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.AllReduceStrategy.MIN_LATENCY">MIN_LATENCY (tensorrt_llm.functional.AllReduceStrategy attribute)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.SamplingConfig.min_length">min_length (tensorrt_llm.runtime.SamplingConfig attribute)</a>
 </li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.SamplingParams.min_p">min_p (tensorrt_llm.llmapi.SamplingParams attribute)</a>
 
       <ul>
@@ -2354,6 +2468,8 @@
       <li><a href="python-api/tensorrt_llm.layers.html#tensorrt_llm.layers.activation.Mish">Mish (class in tensorrt_llm.layers.activation)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.QuantAlgo.MIXED_PRECISION">MIXED_PRECISION (tensorrt_llm.llmapi.QuantAlgo attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.mixed_sampler">mixed_sampler (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.MLLaMAForCausalLM">MLLaMAForCausalLM (class in tensorrt_llm.models)</a>
 </li>
@@ -2390,12 +2506,24 @@
         <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.MedusaDecodingConfig.model_config">(tensorrt_llm.llmapi.MedusaDecodingConfig attribute)</a>
 </li>
         <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.MTPDecodingConfig.model_config">(tensorrt_llm.llmapi.MTPDecodingConfig attribute)</a>
+</li>
+        <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.NGramDecodingConfig.model_config">(tensorrt_llm.llmapi.NGramDecodingConfig attribute)</a>
 </li>
         <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.SchedulerConfig.model_config">(tensorrt_llm.llmapi.SchedulerConfig attribute)</a>
+</li>
+        <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.model_config">(tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+</li>
+        <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.model_config">(tensorrt_llm.llmapi.TrtLlmArgs attribute)</a>
 </li>
       </ul></li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.ModelConfig.model_name">model_name (tensorrt_llm.runtime.ModelConfig attribute)</a>
 </li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.model_post_init">model_post_init() (tensorrt_llm.llmapi.TorchLlmArgs method)</a>
+
+      <ul>
+        <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.model_post_init">(tensorrt_llm.llmapi.TrtLlmArgs method)</a>
+</li>
+      </ul></li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.ModelConfig">ModelConfig (class in tensorrt_llm.runtime)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.ModelRunner">ModelRunner (class in tensorrt_llm.runtime)</a>
@@ -2442,6 +2570,12 @@
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.SideStreamIDType.moe">moe (tensorrt_llm.functional.SideStreamIDType attribute)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.AllReduceFusionOp.MOE_ALLREDUCE_RESIDUAL_RMS_NORM">MOE_ALLREDUCE_RESIDUAL_RMS_NORM (tensorrt_llm.functional.AllReduceFusionOp attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.moe_backend">moe_backend (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.moe_load_balancer">moe_load_balancer (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.moe_max_num_tokens">moe_max_num_tokens (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.BuildConfig.monitor_memory">monitor_memory (tensorrt_llm.llmapi.BuildConfig attribute)</a>
 </li>
@@ -2459,6 +2593,12 @@
       </ul></li>
       <li><a href="python-api/tensorrt_llm.layers.html#tensorrt_llm.layers.attention.MropeParams">MropeParams (class in tensorrt_llm.layers.attention)</a>
 </li>
+      <li><a href="llm-api/reference.html#id10">msg (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>, <a href="llm-api/reference.html#id13">[1]</a>, <a href="llm-api/reference.html#id16">[2]</a>, <a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.msg">[3]</a>
+
+      <ul>
+        <li><a href="llm-api/reference.html#id19">(tensorrt_llm.llmapi.TrtLlmArgs attribute)</a>, <a href="llm-api/reference.html#id22">[1]</a>, <a href="llm-api/reference.html#id25">[2]</a>, <a href="llm-api/reference.html#id28">[3]</a>, <a href="llm-api/reference.html#id31">[4]</a>, <a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.msg">[5]</a>
+</li>
+      </ul></li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.MTPDecodingConfig">MTPDecodingConfig (class in tensorrt_llm.llmapi)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.mul">mul() (in module tensorrt_llm.functional)</a>
@@ -2498,6 +2638,10 @@
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.Tensor.network">network (tensorrt_llm.functional.Tensor property)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.GenerationSession.next_medusa_input_ids">next_medusa_input_ids() (tensorrt_llm.runtime.GenerationSession method)</a>
+</li>
+      <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.SpeculativeDecodingMode.NGRAM">NGRAM (tensorrt_llm.models.SpeculativeDecodingMode attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.NGramDecodingConfig">NGramDecodingConfig (class in tensorrt_llm.llmapi)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.QuantAlgo.NO_QUANT">NO_QUANT (tensorrt_llm.llmapi.QuantAlgo attribute)</a>
 </li>
@@ -2520,11 +2664,11 @@
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.not_op">not_op() (in module tensorrt_llm.functional)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.SamplingConfig.num_beams">num_beams (tensorrt_llm.runtime.SamplingConfig attribute)</a>
-</li>
-      <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.GenerationSession.num_draft_tokens">num_draft_tokens (tensorrt_llm.runtime.GenerationSession attribute)</a>
 </li>
   </ul></td>
   <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.GenerationSession.num_draft_tokens">num_draft_tokens (tensorrt_llm.runtime.GenerationSession attribute)</a>
+</li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.EagleDecodingConfig.num_eagle_layers">num_eagle_layers (tensorrt_llm.llmapi.EagleDecodingConfig attribute)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.GenerationSession.num_heads">num_heads (tensorrt_llm.runtime.GenerationSession property)</a>
@@ -2734,6 +2878,8 @@
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.PretrainedConfig">PretrainedConfig (class in tensorrt_llm.models)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.PretrainedModel">PretrainedModel (class in tensorrt_llm.models)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.print_iter_log">print_iter_log (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.priority">priority (tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig property)</a>
 </li>
@@ -2759,6 +2905,8 @@
         <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.SamplingParams.prompt_logprobs">(tensorrt_llm.llmapi.SamplingParams attribute)</a>
 </li>
       </ul></li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.NGramDecodingConfig.prompt_lookup_num_tokens">prompt_lookup_num_tokens (tensorrt_llm.llmapi.NGramDecodingConfig attribute)</a>
+</li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.RequestOutput.prompt_token_ids">prompt_token_ids (tensorrt_llm.llmapi.RequestOutput attribute)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.layers.html#tensorrt_llm.layers.embedding.PromptTuningEmbedding">PromptTuningEmbedding (class in tensorrt_llm.layers.embedding)</a>
@@ -3228,7 +3376,7 @@
         <li><a href="python-api/tensorrt_llm.functional.html#module-tensorrt_llm">module</a>, <a href="python-api/tensorrt_llm.layers.html#module-tensorrt_llm">[1]</a>, <a href="python-api/tensorrt_llm.models.html#module-tensorrt_llm">[2]</a>, <a href="python-api/tensorrt_llm.plugin.html#module-tensorrt_llm">[3]</a>, <a href="python-api/tensorrt_llm.quantization.html#module-tensorrt_llm">[4]</a>, <a href="python-api/tensorrt_llm.runtime.html#module-tensorrt_llm">[5]</a>
 </li>
       </ul></li>
-      <li><a href="_cpp_gen/executor.html#_CPPv412tensorrt_llm">tensorrt_llm (C++ type)</a>, <a href="_cpp_gen/executor.html#_CPPv412tensorrt_llm">[1]</a>, <a href="_cpp_gen/executor.html#_CPPv412tensorrt_llm">[2]</a>, <a href="_cpp_gen/executor.html#_CPPv412tensorrt_llm">[3]</a>, <a href="_cpp_gen/executor.html#_CPPv412tensorrt_llm">[4]</a>, <a href="_cpp_gen/executor.html#_CPPv412tensorrt_llm">[5]</a>, <a href="_cpp_gen/executor.html#_CPPv412tensorrt_llm">[6]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[7]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[8]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[9]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[10]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[11]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[12]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[13]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[14]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[15]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[16]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[17]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[18]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[19]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[20]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[21]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[22]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[23]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[24]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[25]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[26]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[27]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[28]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[29]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[30]</a>, <a 
href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[31]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[32]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[33]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[34]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[35]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[36]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[37]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[38]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[39]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[40]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[41]</a>
+      <li><a href="_cpp_gen/executor.html#_CPPv412tensorrt_llm">tensorrt_llm (C++ type)</a>, <a href="_cpp_gen/executor.html#_CPPv412tensorrt_llm">[1]</a>, <a href="_cpp_gen/executor.html#_CPPv412tensorrt_llm">[2]</a>, <a href="_cpp_gen/executor.html#_CPPv412tensorrt_llm">[3]</a>, <a href="_cpp_gen/executor.html#_CPPv412tensorrt_llm">[4]</a>, <a href="_cpp_gen/executor.html#_CPPv412tensorrt_llm">[5]</a>, <a href="_cpp_gen/executor.html#_CPPv412tensorrt_llm">[6]</a>, <a href="_cpp_gen/executor.html#_CPPv412tensorrt_llm">[7]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[8]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[9]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[10]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[11]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[12]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[13]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[14]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[15]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[16]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[17]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[18]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[19]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[20]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[21]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[22]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[23]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[24]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[25]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[26]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[27]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[28]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[29]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[30]</a>, <a 
href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[31]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[32]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[33]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[34]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[35]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[36]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[37]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[38]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[39]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[40]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[41]</a>, <a href="_cpp_gen/runtime.html#_CPPv412tensorrt_llm">[42]</a>
 </li>
       <li>
     tensorrt_llm.functional
@@ -3328,11 +3476,11 @@
         <li><a href="python-api/tensorrt_llm.runtime.html#module-tensorrt_llm.runtime">module</a>
 </li>
       </ul></li>
-      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm13batch_managerE">tensorrt_llm::batch_manager (C++ type)</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm13batch_managerE">[1]</a>, <a href="_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm13batch_managerE">[2]</a>, <a href="_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm13batch_managerE">[3]</a>, <a href="_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm13batch_managerE">[4]</a>, <a href="_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm13batch_managerE">[5]</a>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm13batch_managerE">tensorrt_llm::batch_manager (C++ type)</a>, <a href="_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm13batch_managerE">[1]</a>, <a href="_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm13batch_managerE">[2]</a>, <a href="_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm13batch_managerE">[3]</a>, <a href="_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm13batch_managerE">[4]</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm13batch_manager16kv_cache_managerE">tensorrt_llm::batch_manager::kv_cache_manager (C++ type)</a>
 </li>
-      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executorE">tensorrt_llm::executor (C++ type)</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executorE">[1]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executorE">[2]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executorE">[3]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executorE">[4]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executorE">[5]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executorE">[6]</a>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executorE">tensorrt_llm::executor (C++ type)</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executorE">[1]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executorE">[2]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executorE">[3]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executorE">[4]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executorE">[5]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executorE">[6]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executorE">[7]</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor21AdditionalModelOutputE">tensorrt_llm::executor::AdditionalModelOutput (C++ class)</a>
 </li>
@@ -4239,6 +4387,8 @@
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor14IterationStats17numQueuedRequestsE">tensorrt_llm::executor::IterationStats::numQueuedRequests (C++ member)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor14IterationStats14pinnedMemUsageE">tensorrt_llm::executor::IterationStats::pinnedMemUsage (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor14IterationStats12specDecStatsE">tensorrt_llm::executor::IterationStats::specDecStats (C++ member)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor14IterationStats19staticBatchingStatsE">tensorrt_llm::executor::IterationStats::staticBatchingStats (C++ member)</a>
 </li>
@@ -4250,7 +4400,59 @@
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK12RequestStats">tensorrt_llm::executor::JsonSerialization::toJsonStr (C++ function)</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK14IterationStats">[1]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK24RequestStatsPerIteration">[2]</a>
 </li>
-      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cacheE">tensorrt_llm::executor::kv_cache (C++ type)</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cacheE">[1]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cacheE">[2]</a>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cacheE">tensorrt_llm::executor::kv_cache (C++ type)</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cacheE">[1]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cacheE">[2]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cacheE">[3]</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDescE">tensorrt_llm::executor::kv_cache::AgentDesc (C++ class)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDesc9AgentDescENSt6stringE">tensorrt_llm::executor::kv_cache::AgentDesc::AgentDesc (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache9AgentDesc19getBackendAgentDescEv">tensorrt_llm::executor::kv_cache::AgentDesc::getBackendAgentDesc (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDesc17mBackendAgentDescE">tensorrt_llm::executor::kv_cache::AgentDesc::mBackendAgentDesc (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentStateE">tensorrt_llm::executor::kv_cache::AgentState (C++ struct)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10AgentStateENSt6stringENSt6stringE">tensorrt_llm::executor::kv_cache::AgentState::AgentState (C++ function)</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10AgentStateEv">[1]</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10mAgentNameE">tensorrt_llm::executor::kv_cache::AgentState::mAgentName (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState15mConnectionInfoE">tensorrt_llm::executor::kv_cache::AgentState::mConnectionInfo (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache10AgentStateeqERK10AgentState">tensorrt_llm::executor::kv_cache::AgentState::operator== (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache10AgentState8toStringEv">tensorrt_llm::executor::kv_cache::AgentState::toString (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache15BaseAgentConfigE">tensorrt_llm::executor::kv_cache::BaseAgentConfig (C++ struct)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache15BaseAgentConfig5mNameE">tensorrt_llm::executor::kv_cache::BaseAgentConfig::mName (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache15BaseAgentConfig13useProgThreadE">tensorrt_llm::executor::kv_cache::BaseAgentConfig::useProgThread (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgentE">tensorrt_llm::executor::kv_cache::BaseTransferAgent (C++ class)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16checkRemoteDescsERKNSt6stringERK11MemoryDescs">tensorrt_llm::executor::kv_cache::BaseTransferAgent::checkRemoteDescs (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent18connectRemoteAgentERKNSt6stringERK18ConnectionInfoType">tensorrt_llm::executor::kv_cache::BaseTransferAgent::connectRemoteAgent (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16deregisterMemoryERK13RegisterDescs">tensorrt_llm::executor::kv_cache::BaseTransferAgent::deregisterMemory (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17getConnectionInfoEv">tensorrt_llm::executor::kv_cache::BaseTransferAgent::getConnectionInfo (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17getLocalAgentDescEv">tensorrt_llm::executor::kv_cache::BaseTransferAgent::getLocalAgentDesc (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent23getNotifiedSyncMessagesEv">tensorrt_llm::executor::kv_cache::BaseTransferAgent::getNotifiedSyncMessages (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent21invalidateRemoteAgentERKNSt6stringE">tensorrt_llm::executor::kv_cache::BaseTransferAgent::invalidateRemoteAgent (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent15loadRemoteAgentERKNSt6stringERK9AgentDesc">tensorrt_llm::executor::kv_cache::BaseTransferAgent::loadRemoteAgent (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17notifySyncMessageERKNSt6stringERK11SyncMessage">tensorrt_llm::executor::kv_cache::BaseTransferAgent::notifySyncMessage (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent14registerMemoryERK13RegisterDescs">tensorrt_llm::executor::kv_cache::BaseTransferAgent::registerMemory (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent22submitTransferRequestsERK15TransferRequest">tensorrt_llm::executor::kv_cache::BaseTransferAgent::submitTransferRequests (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgentD0Ev">tensorrt_llm::executor::kv_cache::BaseTransferAgent::~BaseTransferAgent (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10CacheStateE">tensorrt_llm::executor::kv_cache::CacheState (C++ class)</a>
 </li>
@@ -4316,13 +4518,17 @@
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache9CommStateE">tensorrt_llm::executor::kv_cache::CommState (C++ class)</a>
 </li>
-      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10SizeType32EEi">tensorrt_llm::executor::kv_cache::CommState::CommState (C++ function)</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI11SocketStateEEi">[1]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt8uint16_tENSt6stringE">[2]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateEv">[3]</a>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10AgentStateEEi">tensorrt_llm::executor::kv_cache::CommState::CommState (C++ function)</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10SizeType32EEi">[1]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI11SocketStateEEi">[2]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt8uint16_tENSt6stringE">[3]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateEv">[4]</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState13getAgentStateEv">tensorrt_llm::executor::kv_cache::CommState::getAgentState (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState11getMpiStateEv">tensorrt_llm::executor::kv_cache::CommState::getMpiState (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState10getSelfIdxEv">tensorrt_llm::executor::kv_cache::CommState::getSelfIdx (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState14getSocketStateEv">tensorrt_llm::executor::kv_cache::CommState::getSocketState (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState12isAgentStateEv">tensorrt_llm::executor::kv_cache::CommState::isAgentState (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState10isMpiStateEv">tensorrt_llm::executor::kv_cache::CommState::isMpiState (C++ function)</a>
 </li>
@@ -4345,6 +4551,8 @@
       <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4sendERK11DataContextPKv6size_t">tensorrt_llm::executor::kv_cache::Connection::send (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10ConnectionD0Ev">tensorrt_llm::executor::kv_cache::Connection::~Connection (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache18ConnectionInfoTypeE">tensorrt_llm::executor::kv_cache::ConnectionInfoType (C++ type)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManagerE">tensorrt_llm::executor::kv_cache::ConnectionManager (C++ class)</a>
 </li>
@@ -4363,6 +4571,74 @@
       <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache11DataContext6getTagEv">tensorrt_llm::executor::kv_cache::DataContext::getTag (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache11DataContext4mTagE">tensorrt_llm::executor::kv_cache::DataContext::mTag (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderE">tensorrt_llm::executor::kv_cache::DynLibLoader (C++ class)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader5dlSymEPvPKc">tensorrt_llm::executor::kv_cache::DynLibLoader::dlSym (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader12DynLibLoaderERK12DynLibLoader">tensorrt_llm::executor::kv_cache::DynLibLoader::DynLibLoader (C++ function)</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader12DynLibLoaderEv">[1]</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4I0EN12tensorrt_llm8executor8kv_cache12DynLibLoader18getFunctionPointerE9FunctionTRKNSt6stringERKNSt6stringE">tensorrt_llm::executor::kv_cache::DynLibLoader::getFunctionPointer (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9getHandleERKNSt6stringE">tensorrt_llm::executor::kv_cache::DynLibLoader::getHandle (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader11getInstanceEv">tensorrt_llm::executor::kv_cache::DynLibLoader::getInstance (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9mDllMutexE">tensorrt_llm::executor::kv_cache::DynLibLoader::mDllMutex (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9mHandlersE">tensorrt_llm::executor::kv_cache::DynLibLoader::mHandlers (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderaSERK12DynLibLoader">tensorrt_llm::executor::kv_cache::DynLibLoader::operator= (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderD0Ev">tensorrt_llm::executor::kv_cache::DynLibLoader::~DynLibLoader (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4IDpEN12tensorrt_llm8executor8kv_cache17makeTransferAgentENSt10unique_ptrI17BaseTransferAgentEERKNSt6stringEDpRR4Args">tensorrt_llm::executor::kv_cache::makeTransferAgent (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDescE">tensorrt_llm::executor::kv_cache::MemoryDesc (C++ class)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc11deserializeERNSt7istreamE">tensorrt_llm::executor::kv_cache::MemoryDesc::deserialize (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache10MemoryDesc7getAddrEv">tensorrt_llm::executor::kv_cache::MemoryDesc::getAddr (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache10MemoryDesc11getDeviceIdEv">tensorrt_llm::executor::kv_cache::MemoryDesc::getDeviceId (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache10MemoryDesc6getLenEv">tensorrt_llm::executor::kv_cache::MemoryDesc::getLen (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc5mAddrE">tensorrt_llm::executor::kv_cache::MemoryDesc::mAddr (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc9mDeviceIdE">tensorrt_llm::executor::kv_cache::MemoryDesc::mDeviceId (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescE9uintptr_t6size_t8uint32_t">tensorrt_llm::executor::kv_cache::MemoryDesc::MemoryDesc (C++ function)</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescEPv6size_t8uint32_t">[1]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescERKNSt6vectorIcEE8uint32_t">[2]</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc4mLenE">tensorrt_llm::executor::kv_cache::MemoryDesc::mLen (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc9serializeERK10MemoryDescRNSt7ostreamE">tensorrt_llm::executor::kv_cache::MemoryDesc::serialize (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc14serializedSizeERK10MemoryDesc">tensorrt_llm::executor::kv_cache::MemoryDesc::serializedSize (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescsE">tensorrt_llm::executor::kv_cache::MemoryDescs (C++ class)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache11MemoryDescs8getDescsEv">tensorrt_llm::executor::kv_cache::MemoryDescs::getDescs (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache11MemoryDescs7getTypeEv">tensorrt_llm::executor::kv_cache::MemoryDescs::getType (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs6mDescsE">tensorrt_llm::executor::kv_cache::MemoryDescs::mDescs (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs11MemoryDescsE10MemoryTypeNSt6vectorI10MemoryDescEE">tensorrt_llm::executor::kv_cache::MemoryDescs::MemoryDescs (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs5mTypeE">tensorrt_llm::executor::kv_cache::MemoryDescs::mType (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryTypeE">tensorrt_llm::executor::kv_cache::MemoryType (C++ enum)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType4kBLKE">tensorrt_llm::executor::kv_cache::MemoryType::kBLK (C++ enumerator)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType5kDRAME">tensorrt_llm::executor::kv_cache::MemoryType::kDRAM (C++ enumerator)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType5kFILEE">tensorrt_llm::executor::kv_cache::MemoryType::kFILE (C++ enumerator)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType4kOBJE">tensorrt_llm::executor::kv_cache::MemoryType::kOBJ (C++ enumerator)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType5kVRAME">tensorrt_llm::executor::kv_cache::MemoryType::kVRAM (C++ enumerator)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache8MpiStateE">tensorrt_llm::executor::kv_cache::MpiState (C++ struct)</a>
 </li>
@@ -4371,6 +4647,8 @@
       <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache8MpiStateeqERK8MpiState">tensorrt_llm::executor::kv_cache::MpiState::operator== (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache8MpiState8toStringEv">tensorrt_llm::executor::kv_cache::MpiState::toString (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache13RegisterDescsE">tensorrt_llm::executor::kv_cache::RegisterDescs (C++ type)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache11SocketStateE">tensorrt_llm::executor::kv_cache::SocketState (C++ struct)</a>
 </li>
@@ -4381,6 +4659,48 @@
       <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache11SocketStateeqERK11SocketState">tensorrt_llm::executor::kv_cache::SocketState::operator== (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache11SocketState8toStringEv">tensorrt_llm::executor::kv_cache::SocketState::toString (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache11SyncMessageE">tensorrt_llm::executor::kv_cache::SyncMessage (C++ type)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache13TransferDescsE">tensorrt_llm::executor::kv_cache::TransferDescs (C++ type)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOpE">tensorrt_llm::executor::kv_cache::TransferOp (C++ enum)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOp5kREADE">tensorrt_llm::executor::kv_cache::TransferOp::kREAD (C++ enumerator)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOp6kWRITEE">tensorrt_llm::executor::kv_cache::TransferOp::kWRITE (C++ enumerator)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequestE">tensorrt_llm::executor::kv_cache::TransferRequest (C++ class)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest11getDstDescsEv">tensorrt_llm::executor::kv_cache::TransferRequest::getDstDescs (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest5getOpEv">tensorrt_llm::executor::kv_cache::TransferRequest::getOp (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest13getRemoteNameEv">tensorrt_llm::executor::kv_cache::TransferRequest::getRemoteName (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest11getSrcDescsEv">tensorrt_llm::executor::kv_cache::TransferRequest::getSrcDescs (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest14getSyncMessageEv">tensorrt_llm::executor::kv_cache::TransferRequest::getSyncMessage (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest9mDstDescsE">tensorrt_llm::executor::kv_cache::TransferRequest::mDstDescs (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest3mOpE">tensorrt_llm::executor::kv_cache::TransferRequest::mOp (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest11mRemoteNameE">tensorrt_llm::executor::kv_cache::TransferRequest::mRemoteName (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest9mSrcDescsE">tensorrt_llm::executor::kv_cache::TransferRequest::mSrcDescs (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest12mSyncMessageE">tensorrt_llm::executor::kv_cache::TransferRequest::mSyncMessage (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest15TransferRequestE10TransferOp13TransferDescs13TransferDescsRKNSt6stringENSt8optionalI11SyncMessageEE">tensorrt_llm::executor::kv_cache::TransferRequest::TransferRequest (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache14TransferStatusE">tensorrt_llm::executor::kv_cache::TransferStatus (C++ class)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache14TransferStatus11isCompletedEv">tensorrt_llm::executor::kv_cache::TransferStatus::isCompleted (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor8kv_cache14TransferStatus4waitEv">tensorrt_llm::executor::kv_cache::TransferStatus::wait (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor8kv_cache14TransferStatusD0Ev">tensorrt_llm::executor::kv_cache::TransferStatus::~TransferStatus (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13KvCacheConfigE">tensorrt_llm::executor::KvCacheConfig (C++ class)</a>
 </li>
@@ -4497,10 +4817,14 @@
       <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig19getDecodeDurationMsEv">tensorrt_llm::executor::KvCacheRetentionConfig::getDecodeDurationMs (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig26getDecodeRetentionPriorityEv">tensorrt_llm::executor::KvCacheRetentionConfig::getDecodeRetentionPriority (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig12getDirectoryEv">tensorrt_llm::executor::KvCacheRetentionConfig::getDirectory (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig36getPerBlockRetentionPriorityDurationE10SizeType3210SizeType32">tensorrt_llm::executor::KvCacheRetentionConfig::getPerBlockRetentionPriorityDuration (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig29getTokenRangeRetentionConfigsEv">tensorrt_llm::executor::KvCacheRetentionConfig::getTokenRangeRetentionConfigs (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig15getTransferModeEv">tensorrt_llm::executor::KvCacheRetentionConfig::getTransferMode (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25kDefaultRetentionPriorityE">tensorrt_llm::executor::KvCacheRetentionConfig::kDefaultRetentionPriority (C++ member)</a>
 </li>
@@ -4508,13 +4832,17 @@
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig21kMinRetentionPriorityE">tensorrt_llm::executor::KvCacheRetentionConfig::kMinRetentionPriority (C++ member)</a>
 </li>
-      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE">tensorrt_llm::executor::KvCacheRetentionConfig::KvCacheRetentionConfig (C++ function)</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigEv">[1]</a>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE19KvCacheTransferModeNSt8optionalINSt6stringEEE">tensorrt_llm::executor::KvCacheRetentionConfig::KvCacheRetentionConfig (C++ function)</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigEv">[1]</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig17mDecodeDurationMsE">tensorrt_llm::executor::KvCacheRetentionConfig::mDecodeDurationMs (C++ member)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig24mDecodeRetentionPriorityE">tensorrt_llm::executor::KvCacheRetentionConfig::mDecodeRetentionPriority (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig10mDirectoryE">tensorrt_llm::executor::KvCacheRetentionConfig::mDirectory (C++ member)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig27mTokenRangeRetentionConfigsE">tensorrt_llm::executor::KvCacheRetentionConfig::mTokenRangeRetentionConfigs (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig13mTransferModeE">tensorrt_llm::executor::KvCacheRetentionConfig::mTransferMode (C++ member)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfigeqERK22KvCacheRetentionConfig">tensorrt_llm::executor::KvCacheRetentionConfig::operator== (C++ function)</a>
 </li>
@@ -4571,6 +4899,14 @@
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor17KVCacheStoredData6blocksE">tensorrt_llm::executor::KVCacheStoredData::blocks (C++ member)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor17KVCacheStoredData10parentHashE">tensorrt_llm::executor::KVCacheStoredData::parentHash (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor19KvCacheTransferModeE">tensorrt_llm::executor::KvCacheTransferMode (C++ enum)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode4DRAME">tensorrt_llm::executor::KvCacheTransferMode::DRAM (C++ enumerator)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode3GDSE">tensorrt_llm::executor::KvCacheTransferMode::GDS (C++ enumerator)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode20POSIX_DEBUG_FALLBACKE">tensorrt_llm::executor::KvCacheTransferMode::POSIX_DEBUG_FALLBACK (C++ enumerator)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedDataE">tensorrt_llm::executor::KVCacheUpdatedData (C++ struct)</a>
 </li>
@@ -5367,6 +5703,8 @@
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization32deserializeAdditionalModelOutputERNSt7istreamE">tensorrt_llm::executor::Serialization::deserializeAdditionalModelOutput (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization27deserializeAdditionalOutputERNSt7istreamE">tensorrt_llm::executor::Serialization::deserializeAdditionalOutput (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization21deserializeAgentStateERNSt7istreamE">tensorrt_llm::executor::Serialization::deserializeAgentState (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization15deserializeBoolERNSt7istreamE">tensorrt_llm::executor::Serialization::deserializeBool (C++ function)</a>
 </li>
@@ -5457,6 +5795,8 @@
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization22deserializeSocketStateERNSt7istreamE">tensorrt_llm::executor::Serialization::deserializeSocketState (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization32deserializeSpecDecFastLogitsInfoERNSt7istreamE">tensorrt_llm::executor::Serialization::deserializeSpecDecFastLogitsInfo (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization28deserializeSpecDecodingStatsERNSt7istreamE">tensorrt_llm::executor::Serialization::deserializeSpecDecodingStats (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization36deserializeSpeculativeDecodingConfigERNSt7istreamE">tensorrt_llm::executor::Serialization::deserializeSpeculativeDecodingConfig (C++ function)</a>
 </li>
@@ -5470,9 +5810,9 @@
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization36deserializeTokenRangeRetentionConfigERNSt7istreamE">tensorrt_llm::executor::Serialization::deserializeTokenRangeRetentionConfig (C++ function)</a>
 </li>
-      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK10LoraConfigRNSt7ostreamE">tensorrt_llm::executor::Serialization::serialize (C++ function)</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11DebugConfigRNSt7ostreamE">[1]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11EagleConfigRNSt7ostreamE">[2]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11MropeConfigRNSt7ostreamE">[3]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12DecodingModeRNSt7ostreamE">[4]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12KvCacheStatsRNSt7ostreamE">[5]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12OutputConfigRNSt7ostreamE">[6]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStageRNSt7ostreamE">[7]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStatsRNSt7ostreamE">[8]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK13KvCacheConfigRNSt7ostreamE">[9]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14DecodingConfigRNSt7ostreamE">[10]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ExecutorConfigRNSt7ostreamE">[11]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStats">[12]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStatsRNSt7ostreamE">[13]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ParallelConfigRNSt7ostreamE">[14]</a>, <a 
href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14SamplingConfigRNSt7ostreamE">[15]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15PeftCacheConfigRNSt7ostreamE">[16]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15SchedulerConfigRNSt7ostreamE">[17]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK16AdditionalOutputRNSt7ostreamE">[18]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18ContextPhaseParamsRNSt7ostreamE">[19]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18DynamicBatchConfigRNSt7ostreamE">[20]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18OrchestratorConfigRNSt7ostreamE">[21]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18PromptTuningConfigRNSt7ostreamE">[22]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18RequestPerfMetricsRNSt7ostreamE">[23]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK19StaticBatchingStatsRNSt7ostreamE">[24]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20DataTransceiverState">[25]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20DataTransceiverStateRNSt7ostreamE">[26]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingConfigRNSt7ostreamE">[27]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingParamsRNSt7ostreamE">[28]</a>, <a 
href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21AdditionalModelOutputRNSt7ostreamE">[29]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21InflightBatchingStatsRNSt7ostreamE">[30]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22CacheTransceiverConfigRNSt7ostreamE">[31]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22DisServingRequestStatsRNSt7ostreamE">[32]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22KvCacheRetentionConfigRNSt7ostreamE">[33]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK23LookaheadDecodingConfigRNSt7ostreamE">[34]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK24RequestStatsPerIteration">[35]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK24RequestStatsPerIterationRNSt7ostreamE">[36]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25ExternalDraftTokensConfigRNSt7ostreamE">[37]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25SpeculativeDecodingConfigRNSt7ostreamE">[38]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK29ExtendedRuntimePerfKnobConfigRNSt7ostreamE">[39]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK33SpeculativeDecodingFastLogitsInfoRNSt7ostreamE">[40]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6ResultRNSt7ostreamE">[41]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6TensorRNSt7ostreamE">[42]</a>, <a 
href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK7RequestRNSt7ostreamE">[43]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK8ResponseRNSt7ostreamE">[44]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN18RequestPerfMetrics9TimePointERNSt7ostreamE">[45]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN22KvCacheRetentionConfig25TokenRangeRetentionConfigERNSt7ostreamE">[46]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10CacheStateERNSt7ostreamE">[47]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache11SocketStateERNSt7ostreamE">[48]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache9CommStateERNSt7ostreamE">[49]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI14IterationStatsEE">[50]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI24RequestStatsPerIterationEE">[51]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI8ResponseEE">[52]</a>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK10LoraConfigRNSt7ostreamE">tensorrt_llm::executor::Serialization::serialize (C++ function)</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11DebugConfigRNSt7ostreamE">[1]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11EagleConfigRNSt7ostreamE">[2]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11MropeConfigRNSt7ostreamE">[3]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12DecodingModeRNSt7ostreamE">[4]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12KvCacheStatsRNSt7ostreamE">[5]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12OutputConfigRNSt7ostreamE">[6]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStageRNSt7ostreamE">[7]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStatsRNSt7ostreamE">[8]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK13KvCacheConfigRNSt7ostreamE">[9]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14DecodingConfigRNSt7ostreamE">[10]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ExecutorConfigRNSt7ostreamE">[11]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStats">[12]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStatsRNSt7ostreamE">[13]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ParallelConfigRNSt7ostreamE">[14]</a>, <a 
href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14SamplingConfigRNSt7ostreamE">[15]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15PeftCacheConfigRNSt7ostreamE">[16]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15SchedulerConfigRNSt7ostreamE">[17]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK16AdditionalOutputRNSt7ostreamE">[18]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK17SpecDecodingStatsRNSt7ostreamE">[19]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18ContextPhaseParamsRNSt7ostreamE">[20]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18DynamicBatchConfigRNSt7ostreamE">[21]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18OrchestratorConfigRNSt7ostreamE">[22]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18PromptTuningConfigRNSt7ostreamE">[23]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18RequestPerfMetricsRNSt7ostreamE">[24]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK19StaticBatchingStatsRNSt7ostreamE">[25]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20DataTransceiverState">[26]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20DataTransceiverStateRNSt7ostreamE">[27]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingConfigRNSt7ostreamE">[28]</a>, <a 
href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingParamsRNSt7ostreamE">[29]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21AdditionalModelOutputRNSt7ostreamE">[30]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21InflightBatchingStatsRNSt7ostreamE">[31]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22CacheTransceiverConfigRNSt7ostreamE">[32]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22DisServingRequestStatsRNSt7ostreamE">[33]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22KvCacheRetentionConfigRNSt7ostreamE">[34]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK23LookaheadDecodingConfigRNSt7ostreamE">[35]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK24RequestStatsPerIteration">[36]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK24RequestStatsPerIterationRNSt7ostreamE">[37]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25ExternalDraftTokensConfigRNSt7ostreamE">[38]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25SpeculativeDecodingConfigRNSt7ostreamE">[39]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK29ExtendedRuntimePerfKnobConfigRNSt7ostreamE">[40]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK33SpeculativeDecodingFastLogitsInfoRNSt7ostreamE">[41]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6ResultRNSt7ostreamE">[42]</a>, <a 
href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6TensorRNSt7ostreamE">[43]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK7RequestRNSt7ostreamE">[44]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK8ResponseRNSt7ostreamE">[45]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN18RequestPerfMetrics9TimePointERNSt7ostreamE">[46]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN22KvCacheRetentionConfig25TokenRangeRetentionConfigERNSt7ostreamE">[47]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10AgentStateERNSt7ostreamE">[48]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10CacheStateERNSt7ostreamE">[49]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache11SocketStateERNSt7ostreamE">[50]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache9CommStateERNSt7ostreamE">[51]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI14IterationStatsEE">[52]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI24RequestStatsPerIterationEE">[53]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI8ResponseEE">[54]</a>
 </li>
-      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK10LoraConfig">tensorrt_llm::executor::Serialization::serializedSize (C++ function)</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11DebugConfig">[1]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11EagleConfig">[2]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11MropeConfig">[3]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12DecodingMode">[4]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12KvCacheStats">[5]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12OutputConfig">[6]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12RequestStage">[7]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12RequestStats">[8]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK13KvCacheConfig">[9]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14DecodingConfig">[10]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14ExecutorConfig">[11]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14IterationStats">[12]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14ParallelConfig">[13]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14SamplingConfig">[14]</a>, <a 
href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK15PeftCacheConfig">[15]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK15SchedulerConfig">[16]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK16AdditionalOutput">[17]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18ContextPhaseParams">[18]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18DynamicBatchConfig">[19]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18OrchestratorConfig">[20]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18PromptTuningConfig">[21]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18RequestPerfMetrics">[22]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK19StaticBatchingStats">[23]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20DataTransceiverState">[24]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20GuidedDecodingConfig">[25]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20GuidedDecodingParams">[26]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK21AdditionalModelOutput">[27]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK21InflightBatchingStats">[28]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22CacheTransceiverConfig">[29]</a>, <a 
href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22DisServingRequestStats">[30]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22KvCacheRetentionConfig">[31]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK23LookaheadDecodingConfig">[32]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK24RequestStatsPerIteration">[33]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK25ExternalDraftTokensConfig">[34]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK25SpeculativeDecodingConfig">[35]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK29ExtendedRuntimePerfKnobConfig">[36]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK33SpeculativeDecodingFastLogitsInfo">[37]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK6Result">[38]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK6Tensor">[39]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK7Request">[40]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK8Response">[41]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN18RequestPerfMetrics9TimePointE">[42]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN22KvCacheRetentionConfig25TokenRangeRetentionConfigE">[43]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache10CacheStateE">[44]</a>, <a 
href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache11SocketStateE">[45]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache9CommStateE">[46]</a>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK10LoraConfig">tensorrt_llm::executor::Serialization::serializedSize (C++ function)</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11DebugConfig">[1]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11EagleConfig">[2]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11MropeConfig">[3]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12DecodingMode">[4]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12KvCacheStats">[5]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12OutputConfig">[6]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12RequestStage">[7]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12RequestStats">[8]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK13KvCacheConfig">[9]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14DecodingConfig">[10]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14ExecutorConfig">[11]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14IterationStats">[12]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14ParallelConfig">[13]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14SamplingConfig">[14]</a>, <a 
href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK15PeftCacheConfig">[15]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK15SchedulerConfig">[16]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK16AdditionalOutput">[17]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK17SpecDecodingStats">[18]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18ContextPhaseParams">[19]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18DynamicBatchConfig">[20]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18OrchestratorConfig">[21]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18PromptTuningConfig">[22]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18RequestPerfMetrics">[23]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK19StaticBatchingStats">[24]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20DataTransceiverState">[25]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20GuidedDecodingConfig">[26]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20GuidedDecodingParams">[27]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK21AdditionalModelOutput">[28]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK21InflightBatchingStats">[29]</a>, <a 
href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22CacheTransceiverConfig">[30]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22DisServingRequestStats">[31]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22KvCacheRetentionConfig">[32]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK23LookaheadDecodingConfig">[33]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK24RequestStatsPerIteration">[34]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK25ExternalDraftTokensConfig">[35]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK25SpeculativeDecodingConfig">[36]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK29ExtendedRuntimePerfKnobConfig">[37]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK33SpeculativeDecodingFastLogitsInfo">[38]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK6Result">[39]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK6Tensor">[40]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK7Request">[41]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK8Response">[42]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN18RequestPerfMetrics9TimePointE">[43]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN22KvCacheRetentionConfig25TokenRangeRetentionConfigE">[44]</a>, <a 
href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache10AgentStateE">[45]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache10CacheStateE">[46]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache11SocketStateE">[47]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache9CommStateE">[48]</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor5ShapeE">tensorrt_llm::executor::Shape (C++ class)</a>
 </li>
@@ -5483,6 +5823,22 @@
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor5Shape5ShapeENSt16initializer_listI9DimType64EE">tensorrt_llm::executor::Shape::Shape (C++ function)</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor5Shape5ShapeEPK9DimType64N4Base9size_typeE">[1]</a>, <a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor5Shape5ShapeEv">[2]</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor10SizeType32E">tensorrt_llm::executor::SizeType32 (C++ type)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor10SizeType64E">tensorrt_llm::executor::SizeType64 (C++ type)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor17SpecDecodingStatsE">tensorrt_llm::executor::SpecDecodingStats (C++ struct)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor17SpecDecodingStats16acceptanceLengthE">tensorrt_llm::executor::SpecDecodingStats::acceptanceLength (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor17SpecDecodingStats13draftOverheadE">tensorrt_llm::executor::SpecDecodingStats::draftOverhead (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor17SpecDecodingStats13iterLatencyMSE">tensorrt_llm::executor::SpecDecodingStats::iterLatencyMS (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor17SpecDecodingStats17numAcceptedTokensE">tensorrt_llm::executor::SpecDecodingStats::numAcceptedTokens (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor17SpecDecodingStats14numDraftTokensE">tensorrt_llm::executor::SpecDecodingStats::numDraftTokens (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor17SpecDecodingStats26numRequestsWithDraftTokensE">tensorrt_llm::executor::SpecDecodingStats::numRequestsWithDraftTokens (C++ member)</a>
 </li>
       <li><a href="_cpp_gen/executor.html#_CPPv4N12tensorrt_llm8executor25SpeculativeDecodingConfigE">tensorrt_llm::executor::SpeculativeDecodingConfig (C++ class)</a>
 </li>
@@ -5637,6 +5993,8 @@
       <li><a href="_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb">tensorrt_llm::runtime::AllReduceBuffers::AllReduceBuffers (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers18mAllReduceCommPtrsE">tensorrt_llm::runtime::AllReduceBuffers::mAllReduceCommPtrs (C++ member)</a>
+</li>
+      <li><a href="_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9mFlagPtrsE">tensorrt_llm::runtime::AllReduceBuffers::mFlagPtrs (C++ member)</a>
 </li>
       <li><a href="_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers17mIpcMemoryHandlesE">tensorrt_llm::runtime::AllReduceBuffers::mIpcMemoryHandles (C++ member)</a>
 </li>
@@ -5750,6 +6108,8 @@
 </li>
       <li><a href="_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm7runtime9CudaEvent7Deleter10mOwnsEventE">tensorrt_llm::runtime::CudaEvent::Deleter::mOwnsEvent (C++ member)</a>
 </li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime9CudaEvent7DeleterclE7pointer">tensorrt_llm::runtime::CudaEvent::Deleter::operator() (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm7runtime9CudaEvent12element_typeE">tensorrt_llm::runtime::CudaEvent::element_type (C++ type)</a>
@@ -5846,8 +6206,6 @@
 </li>
       <li><a href="_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEE4nameE">tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64, true&gt;::name (C++ member)</a>
 </li>
-  </ul></td>
-  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEE4sizeE">tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64, true&gt;::size (C++ member)</a>
 </li>
       <li><a href="_cpp_gen/runtime.html#_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEE4typeE">tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64, true&gt;::type (C++ type)</a>
@@ -5927,6 +6285,8 @@
       <li><a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState22getJointDecodingOutputEv">tensorrt_llm::runtime::decoder::DecoderState::getJointDecodingOutput (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState11getLogProbsE10SizeType32">tensorrt_llm::runtime::decoder::DecoderState::getLogProbs (C++ function)</a>, <a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState11getLogProbsEv">[1]</a>
+</li>
+      <li><a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBatchSizeEv">tensorrt_llm::runtime::decoder::DecoderState::getMaxBatchSize (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBeamWidthEv">tensorrt_llm::runtime::decoder::DecoderState::getMaxBeamWidth (C++ function)</a>
 </li>
@@ -5946,7 +6306,7 @@
 </li>
       <li><a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState25getPrevDraftTokensLengthsEv">tensorrt_llm::runtime::decoder::DecoderState::getPrevDraftTokensLengths (C++ function)</a>
 </li>
-      <li><a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsEv">tensorrt_llm::runtime::decoder::DecoderState::getSequenceLengths (C++ function)</a>
+      <li><a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsE10SizeType32">tensorrt_llm::runtime::decoder::DecoderState::getSequenceLengths (C++ function)</a>, <a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsEv">[1]</a>
 </li>
       <li><a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getSpeculativeDecodingModeEv">tensorrt_llm::runtime::decoder::DecoderState::getSpeculativeDecodingMode (C++ function)</a>
 </li>
@@ -7471,6 +7831,8 @@
       <li><a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getDataTypeEv">tensorrt_llm::runtime::ModelConfig::getDataType (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getEncoderHiddenSizeEv">tensorrt_llm::runtime::ModelConfig::getEncoderHiddenSize (C++ function)</a>
+</li>
+      <li><a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getFirstLocalLayerE10SizeType3210SizeType32">tensorrt_llm::runtime::ModelConfig::getFirstLocalLayer (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime11ModelConfig21getGemmAllReduceDtypeEv">tensorrt_llm::runtime::ModelConfig::getGemmAllReduceDtype (C++ function)</a>
 </li>
@@ -7522,7 +7884,7 @@
 </li>
       <li><a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getNbKvHeadsE10SizeType32">tensorrt_llm::runtime::ModelConfig::getNbKvHeads (C++ function)</a>
 </li>
-      <li><a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType32">tensorrt_llm::runtime::ModelConfig::getNbLayers (C++ function)</a>
+      <li><a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType3210SizeType32">tensorrt_llm::runtime::ModelConfig::getNbLayers (C++ function)</a>
 </li>
       <li><a href="_cpp_gen/runtime.html#_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getNbRnnLayersE10SizeType3210SizeType32">tensorrt_llm::runtime::ModelConfig::getNbRnnLayers (C++ function)</a>
 </li>
@@ -8377,6 +8739,18 @@
 </li>
       </ul></li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.topk">topk() (in module tensorrt_llm.functional)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_enable_userbuffers">torch_compile_enable_userbuffers (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_enabled">torch_compile_enabled (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_fullgraph">torch_compile_fullgraph (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_inductor_enabled">torch_compile_inductor_enabled (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_piecewise_cuda_graph">torch_compile_piecewise_cuda_graph (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs">TorchLlmArgs (class in tensorrt_llm.llmapi)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.layers.html#tensorrt_llm.layers.linear.Linear.tp_split_dim">tp_split_dim() (tensorrt_llm.layers.linear.Linear class method)</a>
 
@@ -8386,6 +8760,8 @@
         <li><a href="python-api/tensorrt_llm.layers.html#tensorrt_llm.layers.linear.RowLinear.tp_split_dim">(tensorrt_llm.layers.linear.RowLinear class method)</a>
 </li>
       </ul></li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.KvCacheRetentionConfig.transfer_mode">transfer_mode (tensorrt_llm.llmapi.KvCacheRetentionConfig property)</a>
+</li>
       <li><a href="python-api/tensorrt_llm.functional.html#tensorrt_llm.functional.transpose">transpose() (in module tensorrt_llm.functional)</a>
 
       <ul>
@@ -8466,6 +8842,8 @@
 </li>
       </ul></li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.ModelConfig.trtllm_modules_to_hf_modules">trtllm_modules_to_hf_modules (tensorrt_llm.runtime.ModelConfig attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs">TrtLlmArgs (class in tensorrt_llm.llmapi)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.SamplingParams.truncate_prompt_tokens">truncate_prompt_tokens (tensorrt_llm.llmapi.SamplingParams attribute)</a>
 </li>
@@ -8514,6 +8892,8 @@
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.SamplingConfig.use_beam_hyps">use_beam_hyps (tensorrt_llm.runtime.SamplingConfig attribute)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.SamplingParams.use_beam_search">use_beam_search (tensorrt_llm.llmapi.SamplingParams attribute)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.use_cuda_graph">use_cuda_graph (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.EagleDecodingConfig.use_dynamic_tree">use_dynamic_tree (tensorrt_llm.llmapi.EagleDecodingConfig attribute)</a>
 </li>
@@ -8523,8 +8903,12 @@
   <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.GenerationSession.use_gpt_attention_plugin">use_gpt_attention_plugin (tensorrt_llm.runtime.GenerationSession property)</a>
 </li>
-      <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.GenerationSession.use_kv_cache">use_kv_cache (tensorrt_llm.runtime.GenerationSession property)</a>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.use_kv_cache">use_kv_cache (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>
+
+      <ul>
+        <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.GenerationSession.use_kv_cache">(tensorrt_llm.runtime.GenerationSession property)</a>
 </li>
+      </ul></li>
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.DecoderModel.use_lora">use_lora() (tensorrt_llm.models.DecoderModel method)</a>
 
       <ul>
@@ -8569,6 +8953,10 @@
 <h2 id="V">V</h2>
 <table style="width: 100%" class="indextable genindextable"><tr>
   <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.validate_cuda_graph_config">validate_cuda_graph_config() (tensorrt_llm.llmapi.TorchLlmArgs method)</a>
+</li>
+      <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.validate_cuda_graph_max_batch_size">validate_cuda_graph_max_batch_size() (tensorrt_llm.llmapi.TorchLlmArgs class method)</a>
+</li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.LookaheadDecodingConfig.validate_positive_values">validate_positive_values() (tensorrt_llm.llmapi.LookaheadDecodingConfig class method)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.models.html#tensorrt_llm.models.GemmaConfig.VERBATIM">VERBATIM (tensorrt_llm.models.GemmaConfig attribute)</a>
@@ -8583,10 +8971,10 @@
         <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.TensorInfo.view">(tensorrt_llm.runtime.TensorInfo method)</a>
 </li>
       </ul></li>
-      <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.MultimodalModelRunner.visual_engine_dir">visual_engine_dir (tensorrt_llm.runtime.MultimodalModelRunner property)</a>
-</li>
   </ul></td>
   <td style="width: 33%; vertical-align: top;"><ul>
+      <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.MultimodalModelRunner.visual_engine_dir">visual_engine_dir (tensorrt_llm.runtime.MultimodalModelRunner property)</a>
+</li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.BuildConfig.visualize_network">visualize_network (tensorrt_llm.llmapi.BuildConfig attribute)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.runtime.html#tensorrt_llm.runtime.GenerationSession.vocab_size">vocab_size (tensorrt_llm.runtime.GenerationSession property)</a>
@@ -8633,10 +9021,10 @@
 </li>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN">W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN (tensorrt_llm.llmapi.QuantAlgo attribute)</a>
 </li>
-  </ul></td>
-  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN">W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN (tensorrt_llm.llmapi.QuantAlgo attribute)</a>
 </li>
+  </ul></td>
+  <td style="width: 33%; vertical-align: top;"><ul>
       <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN">W8A8_SQ_PER_TENSOR_PLUGIN (tensorrt_llm.llmapi.QuantAlgo attribute)</a>
 </li>
       <li><a href="python-api/tensorrt_llm.layers.html#tensorrt_llm.layers.attention.DeepseekV2Attention.weight_loader">weight_loader() (tensorrt_llm.layers.attention.DeepseekV2Attention method)</a>
@@ -8659,6 +9047,14 @@
 
       <ul>
         <li><a href="llm-api/reference.html#id1">(tensorrt_llm.llmapi.LLM property)</a>
+</li>
+        <li><a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.workspace">(tensorrt_llm.llmapi.TrtLlmArgs attribute)</a>
+</li>
+      </ul></li>
+      <li><a href="llm-api/reference.html#id11">wrapped_property (tensorrt_llm.llmapi.TorchLlmArgs attribute)</a>, <a href="llm-api/reference.html#id14">[1]</a>, <a href="llm-api/reference.html#id17">[2]</a>, <a href="llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs.wrapped_property">[3]</a>
+
+      <ul>
+        <li><a href="llm-api/reference.html#id20">(tensorrt_llm.llmapi.TrtLlmArgs attribute)</a>, <a href="llm-api/reference.html#id23">[1]</a>, <a href="llm-api/reference.html#id26">[2]</a>, <a href="llm-api/reference.html#id29">[3]</a>, <a href="llm-api/reference.html#id32">[4]</a>, <a href="llm-api/reference.html#tensorrt_llm.llmapi.TrtLlmArgs.wrapped_property">[5]</a>
 </li>
       </ul></li>
   </ul></td>
@@ -8784,6 +9180,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/index.html b/latest/index.html
index c4e44cda0f..a38bd94cb7 100644
--- a/latest/index.html
+++ b/latest/index.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -62,7 +62,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -335,6 +335,7 @@
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -356,6 +357,7 @@
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -420,6 +422,7 @@
 <li class="toctree-l1"><a class="reference internal" href="advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -454,6 +457,7 @@
 <li class="toctree-l1"><a class="reference internal" href="blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -641,6 +645,11 @@
 <li class="toctree-l2"><a class="reference internal" href="advanced/expert-parallelism.html#how-to-enable">How to Enable</a></li>
 </ul>
 </li>
+<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a><ul>
+<li class="toctree-l2"><a class="reference internal" href="advanced/kv-cache-management.html#hierarchy-pool-block-and-page">Hierarchy: Pool, Block, and Page</a></li>
+<li class="toctree-l2"><a class="reference internal" href="advanced/kv-cache-management.html#events-in-kvcacheeventmanager">Events in <code class="docutils literal notranslate"><span class="pre">KVCacheEventManager</span></code></a></li>
+</ul>
+</li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a><ul>
 <li class="toctree-l2"><a class="reference internal" href="advanced/kv-cache-reuse.html#how-to-enable-kv-cache-reuse">How to enable kv cache reuse</a></li>
 <li class="toctree-l2"><a class="reference internal" href="advanced/kv-cache-reuse.html#performance-expectations">Performance expectations</a></li>
@@ -891,6 +900,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/installation/build-from-source-linux.html b/latest/installation/build-from-source-linux.html
index 046660fea3..77e5047f3b 100644
--- a/latest/installation/build-from-source-linux.html
+++ b/latest/installation/build-from-source-linux.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -848,6 +852,15 @@ pip<span class="w"> </span>install<span class="w"> </span>./build/tensorrt_llm*.
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/installation/grace-hopper.html b/latest/installation/grace-hopper.html
index 6705d95cb5..a88006a446 100644
--- a/latest/installation/grace-hopper.html
+++ b/latest/installation/grace-hopper.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -672,6 +676,15 @@ sudo<span class="w"> </span>apt-get<span class="w"> </span>-y<span class="w"> </
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/installation/linux.html b/latest/installation/linux.html
index 060cf3df86..a3beeadce7 100644
--- a/latest/installation/linux.html
+++ b/latest/installation/linux.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -708,6 +712,15 @@ Please install CUDA toolkit when you see the following message when running Mode
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/key-features.html b/latest/key-features.html
index 0776eb158c..73ad3a2bcf 100644
--- a/latest/key-features.html
+++ b/latest/key-features.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -640,6 +644,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/llm-api/index.html b/latest/llm-api/index.html
index 69ff46e942..e89c88f65c 100644
--- a/latest/llm-api/index.html
+++ b/latest/llm-api/index.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -755,6 +759,15 @@ Refer to the <a class="reference external" href="https://github.com/NVIDIA/Tenso
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/llm-api/reference.html b/latest/llm-api/reference.html
index 82ffe4010e..8ee33f8585 100644
--- a/latest/llm-api/reference.html
+++ b/latest/llm-api/reference.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -548,41 +552,39 @@
 <li><p><strong>moe_expert_parallel_size</strong> (<em>Optional</em><em>[</em><em>int</em><em>]</em>) – The expert parallel size for MoE models’s expert weights. Defaults to None.</p></li>
 <li><p><strong>enable_attention_dp</strong> (<em>bool</em>) – Enable attention data parallel. Defaults to False.</p></li>
 <li><p><strong>cp_config</strong> (<em>Optional</em><em>[</em><em>dict</em><em>]</em>) – Context parallel config. Defaults to None.</p></li>
-<li><p><strong>auto_parallel</strong> (<em>bool</em>) – Enable auto parallel mode. Defaults to False.</p></li>
-<li><p><strong>auto_parallel_world_size</strong> (<em>Optional</em><em>[</em><em>int</em><em>]</em>) – The world size for auto parallel mode. Defaults to None.</p></li>
 <li><p><strong>load_format</strong> (<em>Literal</em><em>[</em><em>'auto'</em><em>, </em><em>'dummy'</em><em>]</em>) – The format to load the model. Defaults to auto.</p></li>
-<li><p><strong>enable_tqdm</strong> (<em>bool</em>) – Enable tqdm for progress bar. Defaults to False.</p></li>
 <li><p><strong>enable_lora</strong> (<em>bool</em>) – Enable LoRA. Defaults to False.</p></li>
 <li><p><strong>lora_config</strong> (<em>Optional</em><em>[</em><em>tensorrt_llm.lora_manager.LoraConfig</em><em>]</em>) – LoRA configuration for the model. Defaults to None.</p></li>
 <li><p><strong>enable_prompt_adapter</strong> (<em>bool</em>) – Enable prompt adapter. Defaults to False.</p></li>
 <li><p><strong>max_prompt_adapter_token</strong> (<em>int</em>) – The maximum number of prompt adapter tokens. Defaults to 0.</p></li>
 <li><p><strong>quant_config</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig" title="tensorrt_llm.models.modeling_utils.QuantConfig"><em>tensorrt_llm.models.modeling_utils.QuantConfig</em></a><em>]</em>) – Quantization config. Defaults to None.</p></li>
-<li><p><strong>calib_config</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.CalibConfig" title="tensorrt_llm.llmapi.llm_args.CalibConfig"><em>tensorrt_llm.llmapi.llm_args.CalibConfig</em></a><em>]</em>) – Calibration config. Defaults to None.</p></li>
-<li><p><strong>build_config</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig" title="tensorrt_llm.builder.BuildConfig"><em>tensorrt_llm.builder.BuildConfig</em></a><em>]</em>) – Build config. Defaults to None.</p></li>
-<li><p><strong>kv_cache_config</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig" title="tensorrt_llm.llmapi.llm_args.KvCacheConfig"><em>tensorrt_llm.llmapi.llm_args.KvCacheConfig</em></a><em>]</em>) – KV cache config. Defaults to None.</p></li>
+<li><p><strong>kv_cache_config</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig" title="tensorrt_llm.llmapi.llm_args.KvCacheConfig"><em>tensorrt_llm.llmapi.llm_args.KvCacheConfig</em></a>) – KV cache config. Defaults to None.</p></li>
 <li><p><strong>enable_chunked_prefill</strong> (<em>bool</em>) – Enable chunked prefill. Defaults to False.</p></li>
 <li><p><strong>guided_decoding_backend</strong> (<em>Optional</em><em>[</em><em>str</em><em>]</em>) – Guided decoding backend. Defaults to None.</p></li>
 <li><p><strong>batched_logits_processor</strong> (<em>Optional</em><em>[</em><em>tensorrt_llm.sampling_params.BatchedLogitsProcessor</em><em>]</em>) – Batched logits processor. Defaults to None.</p></li>
 <li><p><strong>iter_stats_max_iterations</strong> (<em>Optional</em><em>[</em><em>int</em><em>]</em>) – The maximum number of iterations for iter stats. Defaults to None.</p></li>
 <li><p><strong>request_stats_max_iterations</strong> (<em>Optional</em><em>[</em><em>int</em><em>]</em>) – The maximum number of iterations for request stats. Defaults to None.</p></li>
-<li><p><strong>workspace</strong> (<em>Optional</em><em>[</em><em>str</em><em>]</em>) – The workspace for the model. Defaults to None.</p></li>
-<li><p><strong>embedding_parallel_mode</strong> (<em>str</em>) – The embedding parallel mode. Defaults to SHARDING_ALONG_VOCAB.</p></li>
-<li><p><strong>fast_build</strong> (<em>bool</em>) – Enable fast build. Defaults to False.</p></li>
-<li><p><strong>enable_build_cache</strong> (<em>Union</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.BuildCacheConfig" title="tensorrt_llm.llmapi.build_cache.BuildCacheConfig"><em>tensorrt_llm.llmapi.build_cache.BuildCacheConfig</em></a><em>, </em><em>bool</em><em>]</em>) – Enable build cache. Defaults to False.</p></li>
 <li><p><strong>peft_cache_config</strong> (<em>Optional</em><em>[</em><em>tensorrt_llm.llmapi.llm_args.PeftCacheConfig</em><em>]</em>) – PEFT cache config. Defaults to None.</p></li>
-<li><p><strong>scheduler_config</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.SchedulerConfig" title="tensorrt_llm.llmapi.llm_args.SchedulerConfig"><em>tensorrt_llm.llmapi.llm_args.SchedulerConfig</em></a><em>]</em>) – Scheduler config. Defaults to None.</p></li>
+<li><p><strong>scheduler_config</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.SchedulerConfig" title="tensorrt_llm.llmapi.llm_args.SchedulerConfig"><em>tensorrt_llm.llmapi.llm_args.SchedulerConfig</em></a>) – Scheduler config. Defaults to None.</p></li>
 <li><p><strong>cache_transceiver_config</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.CacheTransceiverConfig" title="tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig"><em>tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig</em></a><em>]</em>) – Cache transceiver config. Defaults to None.</p></li>
-<li><p><strong>speculative_config</strong> (<em>Union</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig" title="tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig"><em>tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig</em></a><em>, </em><a class="reference internal" href="#tensorrt_llm.llmapi.MedusaDecodingConfig" title="tensorrt_llm.llmapi.llm_args.MedusaDecodingConfig"><em>tensorrt_llm.llmapi.llm_args.MedusaDecodingConfig</em></a><em>, </em><a class="reference internal" href="#tensorrt_llm.llmapi.EagleDecodingConfig" title="tensorrt_llm.llmapi.llm_args.EagleDecodingConfig"><em>tensorrt_llm.llmapi.llm_args.EagleDecodingConfig</em></a><em>, </em><a class="reference internal" href="#tensorrt_llm.llmapi.MTPDecodingConfig" title="tensorrt_llm.llmapi.llm_args.MTPDecodingConfig"><em>tensorrt_llm.llmapi.llm_args.MTPDecodingConfig</em></a><em>, </em><em>NoneType</em><em>]</em>) – Speculative decoding config. Defaults to None.</p></li>
+<li><p><strong>speculative_config</strong> (<em>Union</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig" title="tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig"><em>tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig</em></a><em>, </em><a class="reference internal" href="#tensorrt_llm.llmapi.MedusaDecodingConfig" title="tensorrt_llm.llmapi.llm_args.MedusaDecodingConfig"><em>tensorrt_llm.llmapi.llm_args.MedusaDecodingConfig</em></a><em>, </em><a class="reference internal" href="#tensorrt_llm.llmapi.EagleDecodingConfig" title="tensorrt_llm.llmapi.llm_args.EagleDecodingConfig"><em>tensorrt_llm.llmapi.llm_args.EagleDecodingConfig</em></a><em>, </em><a class="reference internal" href="#tensorrt_llm.llmapi.MTPDecodingConfig" title="tensorrt_llm.llmapi.llm_args.MTPDecodingConfig"><em>tensorrt_llm.llmapi.llm_args.MTPDecodingConfig</em></a><em>, </em><a class="reference internal" href="#tensorrt_llm.llmapi.NGramDecodingConfig" title="tensorrt_llm.llmapi.llm_args.NGramDecodingConfig"><em>tensorrt_llm.llmapi.llm_args.NGramDecodingConfig</em></a><em>, </em><em>NoneType</em><em>]</em>) – Speculative decoding config. Defaults to None.</p></li>
 <li><p><strong>batching_type</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.BatchingType" title="tensorrt_llm.llmapi.llm_args.BatchingType"><em>tensorrt_llm.llmapi.llm_args.BatchingType</em></a><em>]</em>) – Batching type. Defaults to None.</p></li>
 <li><p><strong>normalize_log_probs</strong> (<em>bool</em>) – Normalize log probabilities. Defaults to False.</p></li>
-<li><p><strong>gather_generation_logits</strong> (<em>bool</em>) – Gather generation logits. Defaults to False.</p></li>
-<li><p><strong>extended_runtime_perf_knob_config</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig" title="tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig"><em>tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig</em></a><em>]</em>) – Extended runtime perf knob config. Defaults to None.</p></li>
 <li><p><strong>max_batch_size</strong> (<em>Optional</em><em>[</em><em>int</em><em>]</em>) – The maximum batch size. Defaults to None.</p></li>
 <li><p><strong>max_input_len</strong> (<em>int</em>) – The maximum input length. Defaults to 1024.</p></li>
 <li><p><strong>max_seq_len</strong> (<em>Optional</em><em>[</em><em>int</em><em>]</em>) – The maximum sequence length. Defaults to None.</p></li>
 <li><p><strong>max_beam_width</strong> (<em>int</em>) – The maximum beam width. Defaults to 1.</p></li>
 <li><p><strong>max_num_tokens</strong> (<em>Optional</em><em>[</em><em>int</em><em>]</em>) – The maximum number of tokens. Defaults to None.</p></li>
 <li><p><strong>backend</strong> (<em>Optional</em><em>[</em><em>str</em><em>]</em>) – The backend to use. Defaults to None.</p></li>
+<li><p><strong>gather_generation_logits</strong> (<em>bool</em>) – Gather generation logits. Defaults to False.</p></li>
+<li><p><strong>enable_tqdm</strong> (<em>bool</em>) – Enable tqdm for progress bar. Defaults to False.</p></li>
+<li><p><strong>build_config</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig" title="tensorrt_llm.builder.BuildConfig"><em>tensorrt_llm.builder.BuildConfig</em></a><em>]</em>) – Build config. Defaults to None.</p></li>
+<li><p><strong>workspace</strong> (<em>Optional</em><em>[</em><em>str</em><em>]</em>) – The workspace for the model. Defaults to None.</p></li>
+<li><p><strong>enable_build_cache</strong> (<em>Union</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.BuildCacheConfig" title="tensorrt_llm.llmapi.build_cache.BuildCacheConfig"><em>tensorrt_llm.llmapi.build_cache.BuildCacheConfig</em></a><em>, </em><em>bool</em><em>]</em>) – Enable build cache. Defaults to False.</p></li>
+<li><p><strong>extended_runtime_perf_knob_config</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig" title="tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig"><em>tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig</em></a><em>]</em>) – Extended runtime perf knob config. Defaults to None.</p></li>
+<li><p><strong>calib_config</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.CalibConfig" title="tensorrt_llm.llmapi.llm_args.CalibConfig"><em>tensorrt_llm.llmapi.llm_args.CalibConfig</em></a><em>]</em>) – Calibration config. Defaults to None.</p></li>
+<li><p><strong>embedding_parallel_mode</strong> (<em>str</em>) – The embedding parallel mode. Defaults to SHARDING_ALONG_VOCAB.</p></li>
+<li><p><strong>fast_build</strong> (<em>bool</em>) – Enable fast build. Defaults to False.</p></li>
 <li><p><strong>kwargs</strong> (<em>Any</em>) – Advanced arguments passed to <cite>LlmArgs</cite>.</p></li>
 </ul>
 </dd>
@@ -1860,6 +1862,8 @@ The BatchedLogitsProcessor class is recommended for callback creation. The callb
 <dd><em class="sig-param"><span class="n"><span class="pre">token_range_retention_configs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig" title="tensorrt_llm.bindings.executor.KvCacheRetentionConfig.TokenRangeRetentionConfig"><span class="pre">tensorrt_llm.bindings.executor.KvCacheRetentionConfig.TokenRangeRetentionConfig</span></a><span class="p"><span class="pre">]</span></span></span></em>,</dd>
 <dd><em class="sig-param"><span class="n"><span class="pre">decode_retention_priority</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">35</span></span></em>,</dd>
 <dd><em class="sig-param"><span class="n"><span class="pre">decode_duration_ms</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">datetime.timedelta</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">transfer_mode</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">tensorrt_llm.bindings.executor.KvCacheTransferMode</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">DRAM</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">directory</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
 </dl>
 
 <span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.__init__" title="Link to this definition">#</a></dt>
@@ -1875,11 +1879,21 @@ The BatchedLogitsProcessor class is recommended for callback creation. The callb
 <em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">decode_retention_priority</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.decode_retention_priority" title="Link to this definition">#</a></dt>
 <dd></dd></dl>
 
+<dl class="py property">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheRetentionConfig.directory">
+<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">directory</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.directory" title="Link to this definition">#</a></dt>
+<dd></dd></dl>
+
 <dl class="py property">
 <dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheRetentionConfig.token_range_retention_configs">
 <em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">token_range_retention_configs</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.token_range_retention_configs" title="Link to this definition">#</a></dt>
 <dd></dd></dl>
 
+<dl class="py property">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheRetentionConfig.transfer_mode">
+<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">transfer_mode</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.transfer_mode" title="Link to this definition">#</a></dt>
+<dd></dd></dl>
+
 </dd></dl>
 
 <dl class="py class">
@@ -1926,18 +1940,39 @@ validated to form a valid model.</p>
 <dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig.max_ngram_size">
 <em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_ngram_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">3</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.max_ngram_size" title="Link to this definition">#</a></dt>
 <dd><p>Number of tokens per NGram.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_positive_values</span></code></p></li>
+</ul>
+</dd>
+</dl>
 </dd></dl>
 
 <dl class="py attribute pydantic_field">
 <dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig.max_verification_set_size">
 <em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_verification_set_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">4</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.max_verification_set_size" title="Link to this definition">#</a></dt>
 <dd><p>Number of NGrams in verification branch per step.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_positive_values</span></code></p></li>
+</ul>
+</dd>
+</dl>
 </dd></dl>
 
 <dl class="py attribute pydantic_field">
 <dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig.max_window_size">
 <em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_window_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">4</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.max_window_size" title="Link to this definition">#</a></dt>
 <dd><p>Number of NGrams in lookahead branch per step.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_positive_values</span></code></p></li>
+</ul>
+</dd>
+</dl>
 </dd></dl>
 
 <dl class="py attribute">
@@ -1946,9 +1981,9 @@ validated to form a valid model.</p>
 <dd><p>Configuration for the model, should be a dictionary conforming to [<cite>ConfigDict</cite>][pydantic.config.ConfigDict].</p>
 </dd></dl>
 
-<dl class="py method">
+<dl class="py method pydantic_validator">
 <dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig.validate_positive_values">
-<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">validate_positive_values</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">v</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#LookaheadDecodingConfig.validate_positive_values"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.validate_positive_values" title="Link to this definition">#</a></dt>
+<em class="property"><span class="pre">validator</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">validate_positive_values</span></span><em class="autodoc_pydantic_validator_arrow property">&#160; <span class="pre">»</span>&#160; </em><em class="xref py py-obj"><span class="pre">max_ngram_size</span></em><em class="property"><span class="pre">,</span> </em><em class="xref py py-obj"><span class="pre">max_window_size</span></em><em class="property"><span class="pre">,</span> </em><em class="xref py py-obj"><span class="pre">max_verification_set_size</span></em><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#LookaheadDecodingConfig.validate_positive_values"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.validate_positive_values" title="Link to this definition">#</a></dt>
 <dd></dd></dl>
 
 </dd></dl>
@@ -2011,6 +2046,7 @@ validated to form a valid model.</p>
 <dd><em class="sig-param"><span class="n"><span class="pre">num_eagle_layers</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
 <dd><em class="sig-param"><span class="n"><span class="pre">max_non_leaves_per_layer</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
 <dd><em class="sig-param"><span class="n"><span class="pre">pytorch_eagle_weights_path</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">eagle3_one_model</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
 </dl>
 
 <span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#EagleDecodingConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.EagleDecodingConfig" title="Link to this definition">#</a></dt>
@@ -2025,6 +2061,11 @@ validated to form a valid model.</p>
 <em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">dynamic_tree_max_topK</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.EagleDecodingConfig.dynamic_tree_max_topK" title="Link to this definition">#</a></dt>
 <dd></dd></dl>
 
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.EagleDecodingConfig.eagle3_one_model">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">eagle3_one_model</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.EagleDecodingConfig.eagle3_one_model" title="Link to this definition">#</a></dt>
+<dd></dd></dl>
+
 <dl class="py attribute pydantic_field">
 <dt class="sig sig-object py" id="tensorrt_llm.llmapi.EagleDecodingConfig.eagle_choices">
 <em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">eagle_choices</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.EagleDecodingConfig.eagle_choices" title="Link to this definition">#</a></dt>
@@ -3172,6 +3213,957 @@ changed, you should remove the caches manually.</p>
 
 </dd></dl>
 
+<dl class="py class">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.NGramDecodingConfig">
+<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">NGramDecodingConfig</span></span><span class="sig-paren">(</span>
+
+<dl>
+<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_draft_len</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">speculative_model</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Path</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">prompt_lookup_num_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">2</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_matching_ngram_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">4</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">is_keep_all</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">is_use_oldest</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">is_public_pool</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
+</dl>
+
+<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#NGramDecodingConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.NGramDecodingConfig" title="Link to this definition">#</a></dt>
+<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">DecodingBaseConfig</span></code></p>
+<p>Configuration for NGram drafter speculative decoding.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>prompt_lookup_num_tokens</strong> – int
+The maximum length of the draft tokens (that is, the maximum number of output draft tokens).</p></li>
+<li><p><strong>max_matching_ngram_size</strong> – int
+The maximum length of the search tokens (that is, the maximum number of input tokens used for searching).</p></li>
+<li><p><strong>is_keep_all</strong> – bool = True
+Whether to keep all candidate matches for each pattern; if False, only one match is kept per pattern.</p></li>
+<li><p><strong>is_use_oldest</strong> – bool = True
+Whether to provide the oldest match when a pattern is hit; if False, the newest match is provided.</p></li>
+<li><p><strong>is_public_pool</strong> – bool = True
+Whether to use a common pool shared by all requests; if False, each request has its own private pool.</p></li>
+</ul>
+</dd>
+</dl>
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.NGramDecodingConfig.decoding_type">
+<span class="sig-name descname"><span class="pre">decoding_type</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ClassVar</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'NGram'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.NGramDecodingConfig.decoding_type" title="Link to this definition">#</a></dt>
+<dd></dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.NGramDecodingConfig.from_dict">
+<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_dict</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#NGramDecodingConfig.from_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.NGramDecodingConfig.from_dict" title="Link to this definition">#</a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.NGramDecodingConfig.is_keep_all">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">is_keep_all</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.NGramDecodingConfig.is_keep_all" title="Link to this definition">#</a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.NGramDecodingConfig.is_public_pool">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">is_public_pool</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.NGramDecodingConfig.is_public_pool" title="Link to this definition">#</a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.NGramDecodingConfig.is_use_oldest">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">is_use_oldest</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.NGramDecodingConfig.is_use_oldest" title="Link to this definition">#</a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.NGramDecodingConfig.max_matching_ngram_size">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_matching_ngram_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">4</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.NGramDecodingConfig.max_matching_ngram_size" title="Link to this definition">#</a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.NGramDecodingConfig.model_config">
+<span class="sig-name descname"><span class="pre">model_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ClassVar</span><span class="p"><span class="pre">[</span></span><span class="pre">ConfigDict</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">{}</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.NGramDecodingConfig.model_config" title="Link to this definition">#</a></dt>
+<dd><p>Configuration for the model, should be a dictionary conforming to [<cite>ConfigDict</cite>][pydantic.config.ConfigDict].</p>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.NGramDecodingConfig.prompt_lookup_num_tokens">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">prompt_lookup_num_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">2</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.NGramDecodingConfig.prompt_lookup_num_tokens" title="Link to this definition">#</a></dt>
+<dd></dd></dl>
+
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LlmArgs">
+<span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">LlmArgs</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.LlmArgs" title="Link to this definition">#</a></dt>
+<dd><p>alias of <a class="reference internal" href="#tensorrt_llm.llmapi.TrtLlmArgs" title="tensorrt_llm.llmapi.llm_args.TrtLlmArgs"><code class="xref py py-class docutils literal notranslate"><span class="pre">TrtLlmArgs</span></code></a></p>
+</dd></dl>
+
+<dl class="py class">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs">
+<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">TorchLlmArgs</span></span><span class="sig-paren">(</span>
+
+<dl>
+<dd><em class="sig-param"><span class="n"><span class="pre">*</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">model:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">~pathlib.Path</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">tokenizer:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">~pathlib.Path</span> <span class="pre">|</span> <span class="pre">~transformers.tokenization_utils_base.PreTrainedTokenizerBase</span> <span class="pre">|</span> <span class="pre">~tensorrt_llm.llmapi.tokenizer.TokenizerBase</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">tokenizer_mode:</span> <span class="pre">~typing.Literal['auto'</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">'slow']</span> <span class="pre">=</span> <span class="pre">'auto'</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">skip_tokenizer_init:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">trust_remote_code:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">tensor_parallel_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">dtype:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">'auto'</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">revision:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">tokenizer_revision:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">pipeline_parallel_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">context_parallel_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">gpus_per_node:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">moe_cluster_parallel_size:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">moe_tensor_parallel_size:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">moe_expert_parallel_size:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">enable_attention_dp:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">cp_config:</span> <span class="pre">dict</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">load_format:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.LoadFormat</span> <span class="pre">=</span> <span class="pre">LoadFormat.AUTO</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">enable_lora:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_lora_rank:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_loras:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">4</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_cpu_loras:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">4</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">lora_config:</span> <span class="pre">~tensorrt_llm.lora_manager.LoraConfig</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">enable_prompt_adapter:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_prompt_adapter_token:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">quant_config:</span> <span class="pre">~tensorrt_llm.models.modeling_utils.QuantConfig</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">kv_cache_config:</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.KvCacheConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">enable_chunked_prefill:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">guided_decoding_backend:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">batched_logits_processor:</span> <span class="pre">object</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">iter_stats_max_iterations:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">request_stats_max_iterations:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">peft_cache_config:</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.PeftCacheConfig</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">scheduler_config:</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.SchedulerConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">cache_transceiver_config:</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">speculative_config:</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig</span> <span class="pre">|</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.MedusaDecodingConfig</span> <span class="pre">|</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.EagleDecodingConfig</span> <span class="pre">|</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.MTPDecodingConfig</span> <span class="pre">|</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.NGramDecodingConfig</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">batching_type:</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.BatchingType</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">normalize_log_probs:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_batch_size:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_input_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1024</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_seq_len:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_beam_width:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_num_tokens:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">backend:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">gather_generation_logits:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">_num_postprocess_workers:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">_postprocess_tokenizer_dir:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">_reasoning_parser:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">decoding_config:</span> <span class="pre">object</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">_mpi_session:</span> <span class="pre">object</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">build_config:</span> <span class="pre">object</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">use_cuda_graph:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">cuda_graph_batch_sizes:</span> <span class="pre">~typing.List[int]</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">cuda_graph_max_batch_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">cuda_graph_padding_enabled:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">disable_overlap_scheduler:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">moe_max_num_tokens:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">moe_load_balancer:</span> <span class="pre">object</span> <span class="pre">|</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">attn_backend:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">'TRTLLM'</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">moe_backend:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">'CUTLASS'</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">mixed_sampler:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">enable_trtllm_sampler:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">kv_cache_dtype:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">'auto'</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">use_kv_cache:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">True</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">enable_iter_perf_stats:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">enable_iter_req_stats:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">print_iter_log:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">torch_compile_enabled:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">torch_compile_fullgraph:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">True</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">torch_compile_inductor_enabled:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">torch_compile_piecewise_cuda_graph:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">torch_compile_enable_userbuffers:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">True</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">autotuner_enabled:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">True</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">enable_layerwise_nvtx_marker:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">auto_deploy_config:</span> <span class="pre">object</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">enable_min_latency:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">**extra_data:</span> <span class="pre">~typing.Any</span></span></em>,</dd>
+</dl>
+
+<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#TorchLlmArgs"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs" title="Link to this definition">#</a></dt>
+<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">BaseLlmArgs</span></code></p>
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.attn_backend">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">attn_backend</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'TRTLLM'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.attn_backend" title="Link to this definition">#</a></dt>
+<dd><p>Attention backend to use.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.auto_deploy_config">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">auto_deploy_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">object</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.auto_deploy_config" title="Link to this definition">#</a></dt>
+<dd><p>Auto deploy config.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.autotuner_enabled">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">autotuner_enabled</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.autotuner_enabled" title="Link to this definition">#</a></dt>
+<dd><p>Enable autotuner only when torch compile is enabled.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.build_config">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">build_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">object</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.build_config" title="Link to this definition">#</a></dt>
+<dd><p>Build config.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method pydantic_validator">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.convert_load_format">
+<em class="property"><span class="pre">validator</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">convert_load_format</span></span><em class="autodoc_pydantic_validator_arrow property">&#160; <span class="pre">»</span>&#160; </em><em class="xref py py-obj"><span class="pre">load_format</span></em><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#TorchLlmArgs.convert_load_format"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.convert_load_format" title="Link to this definition">#</a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.cuda_graph_batch_sizes">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">cuda_graph_batch_sizes</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.cuda_graph_batch_sizes" title="Link to this definition">#</a></dt>
+<dd><p>List of batch sizes to create CUDA graphs for.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.cuda_graph_max_batch_size">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">cuda_graph_max_batch_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.cuda_graph_max_batch_size" title="Link to this definition">#</a></dt>
+<dd><p>Maximum batch size for CUDA graphs.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_max_batch_size</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.cuda_graph_padding_enabled">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">cuda_graph_padding_enabled</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.cuda_graph_padding_enabled" title="Link to this definition">#</a></dt>
+<dd><p>If true, batches are rounded up to the nearest cuda_graph_batch_size. This is usually a net win for performance.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.decoding_config">
+<span class="sig-name descname"><span class="pre">decoding_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">object</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.decoding_config" title="Link to this definition">#</a></dt>
+<dd><p>Read-only data descriptor used to emit a runtime deprecation warning before accessing a deprecated field.</p>
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.msg">
+<span class="sig-name descname"><span class="pre">msg</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.msg" title="Link to this definition">#</a></dt>
+<dd><p>The deprecation message to be emitted.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.wrapped_property">
+<span class="sig-name descname"><span class="pre">wrapped_property</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.wrapped_property" title="Link to this definition">#</a></dt>
+<dd><p>The property instance if the deprecated field is a computed field, or <code class="docutils literal notranslate"><span class="pre">None</span></code>.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.field_name">
+<span class="sig-name descname"><span class="pre">field_name</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.field_name" title="Link to this definition">#</a></dt>
+<dd><p>The name of the field being deprecated.</p>
+</dd></dl>
+
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.disable_overlap_scheduler">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">disable_overlap_scheduler</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.disable_overlap_scheduler" title="Link to this definition">#</a></dt>
+<dd><p>Disable the overlap scheduler.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.enable_iter_perf_stats">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">enable_iter_perf_stats</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.enable_iter_perf_stats" title="Link to this definition">#</a></dt>
+<dd><p>Enable iteration performance statistics.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.enable_iter_req_stats">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">enable_iter_req_stats</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.enable_iter_req_stats" title="Link to this definition">#</a></dt>
+<dd><p>If true, enables per-request stats for each iteration. To get request stats, enable_iter_perf_stats must also be set to true.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.enable_layerwise_nvtx_marker">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">enable_layerwise_nvtx_marker</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.enable_layerwise_nvtx_marker" title="Link to this definition">#</a></dt>
+<dd><p>If true, enable layerwise nvtx marker.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.enable_min_latency">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">enable_min_latency</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.enable_min_latency" title="Link to this definition">#</a></dt>
+<dd><p>If true, enable min-latency mode. Currently only used for Llama4.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.enable_trtllm_sampler">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">enable_trtllm_sampler</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.enable_trtllm_sampler" title="Link to this definition">#</a></dt>
+<dd><p>If true, use the TRTLLM sampler instead of the PyTorch sampler. The TRTLLM sampler supports a wide range of sampling strategies.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py property">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.extra_resource_managers">
+<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">extra_resource_managers</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Dict</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">object</span><span class="p"><span class="pre">]</span></span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.extra_resource_managers" title="Link to this definition">#</a></dt>
+<dd></dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.get_pytorch_backend_config">
+<span class="sig-name descname"><span class="pre">get_pytorch_backend_config</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">PyTorchConfig</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#TorchLlmArgs.get_pytorch_backend_config"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.get_pytorch_backend_config" title="Link to this definition">#</a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.kv_cache_dtype">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">kv_cache_dtype</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'auto'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.kv_cache_dtype" title="Link to this definition">#</a></dt>
+<dd><p>Data type for KV cache.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.load_format">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">load_format</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">LoadFormat</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">LoadFormat.AUTO</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.load_format" title="Link to this definition">#</a></dt>
+<dd><p>How to load the model weights. By default, detect the weight type from the model checkpoint.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">convert_load_format</span></code></p></li>
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.max_cpu_loras">
+<span class="sig-name descname"><span class="pre">max_cpu_loras</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.max_cpu_loras" title="Link to this definition">#</a></dt>
+<dd><p>Read-only data descriptor used to emit a runtime deprecation warning before accessing a deprecated field.</p>
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id10">
+<span class="sig-name descname"><span class="pre">msg</span></span><a class="headerlink" href="#id10" title="Link to this definition">#</a></dt>
+<dd><p>The deprecation message to be emitted.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id11">
+<span class="sig-name descname"><span class="pre">wrapped_property</span></span><a class="headerlink" href="#id11" title="Link to this definition">#</a></dt>
+<dd><p>The property instance if the deprecated field is a computed field, or <cite>None</cite>.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id12">
+<span class="sig-name descname"><span class="pre">field_name</span></span><a class="headerlink" href="#id12" title="Link to this definition">#</a></dt>
+<dd><p>The name of the field being deprecated.</p>
+</dd></dl>
+
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.max_lora_rank">
+<span class="sig-name descname"><span class="pre">max_lora_rank</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.max_lora_rank" title="Link to this definition">#</a></dt>
+<dd><p>Read-only data descriptor used to emit a runtime deprecation warning before accessing a deprecated field.</p>
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id13">
+<span class="sig-name descname"><span class="pre">msg</span></span><a class="headerlink" href="#id13" title="Link to this definition">#</a></dt>
+<dd><p>The deprecation message to be emitted.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id14">
+<span class="sig-name descname"><span class="pre">wrapped_property</span></span><a class="headerlink" href="#id14" title="Link to this definition">#</a></dt>
+<dd><p>The property instance if the deprecated field is a computed field, or <cite>None</cite>.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id15">
+<span class="sig-name descname"><span class="pre">field_name</span></span><a class="headerlink" href="#id15" title="Link to this definition">#</a></dt>
+<dd><p>The name of the field being deprecated.</p>
+</dd></dl>
+
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.max_loras">
+<span class="sig-name descname"><span class="pre">max_loras</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.max_loras" title="Link to this definition">#</a></dt>
+<dd><p>Read-only data descriptor used to emit a runtime deprecation warning before accessing a deprecated field.</p>
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id16">
+<span class="sig-name descname"><span class="pre">msg</span></span><a class="headerlink" href="#id16" title="Link to this definition">#</a></dt>
+<dd><p>The deprecation message to be emitted.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id17">
+<span class="sig-name descname"><span class="pre">wrapped_property</span></span><a class="headerlink" href="#id17" title="Link to this definition">#</a></dt>
+<dd><p>The property instance if the deprecated field is a computed field, or <cite>None</cite>.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id18">
+<span class="sig-name descname"><span class="pre">field_name</span></span><a class="headerlink" href="#id18" title="Link to this definition">#</a></dt>
+<dd><p>The name of the field being deprecated.</p>
+</dd></dl>
+
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.mixed_sampler">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">mixed_sampler</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.mixed_sampler" title="Link to this definition">#</a></dt>
+<dd><p>If true, iterates over the sampling_params of each request and uses the corresponding sampling strategy (e.g. top-k or top-p).</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.model_config">
+<span class="sig-name descname"><span class="pre">model_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ClassVar</span><span class="p"><span class="pre">[</span></span><span class="pre">ConfigDict</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">{'arbitrary_types_allowed':</span> <span class="pre">True,</span> <span class="pre">'extra':</span> <span class="pre">'allow'}</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.model_config" title="Link to this definition">#</a></dt>
+<dd><p>Configuration for the model; should be a dictionary conforming to <cite>ConfigDict</cite> (<code class="docutils literal notranslate"><span class="pre">pydantic.config.ConfigDict</span></code>).</p>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.model_post_init">
+<span class="sig-name descname"><span class="pre">model_post_init</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">_TorchLlmArgs__context</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#TorchLlmArgs.model_post_init"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.model_post_init" title="Link to this definition">#</a></dt>
+<dd><p>Override this method to perform additional initialization after <cite>__init__</cite> and <cite>model_construct</cite>.
+This is useful if you want to do some validation that requires the entire model to be initialized.</p>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.moe_backend">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">moe_backend</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'CUTLASS'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.moe_backend" title="Link to this definition">#</a></dt>
+<dd><p>MoE backend to use.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.moe_load_balancer">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">moe_load_balancer</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">object</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.moe_load_balancer" title="Link to this definition">#</a></dt>
+<dd><p>Configuration for MoE load balancing.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.moe_max_num_tokens">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">moe_max_num_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.moe_max_num_tokens" title="Link to this definition">#</a></dt>
+<dd><p>If set, at most moe_max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. If the number of tokens exceeds moe_max_num_tokens, the input tensors will be split into chunks and a for loop will be used.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.print_iter_log">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">print_iter_log</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.print_iter_log" title="Link to this definition">#</a></dt>
+<dd><p>Print iteration logs.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_enable_userbuffers">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">torch_compile_enable_userbuffers</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_enable_userbuffers" title="Link to this definition">#</a></dt>
+<dd><p>When torch compile is enabled, userbuffers is enabled by default.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_enabled">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">torch_compile_enabled</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_enabled" title="Link to this definition">#</a></dt>
+<dd><p>Enable torch.compile optimization.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_fullgraph">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">torch_compile_fullgraph</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_fullgraph" title="Link to this definition">#</a></dt>
+<dd><p>Enable full graph compilation in torch.compile.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_inductor_enabled">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">torch_compile_inductor_enabled</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_inductor_enabled" title="Link to this definition">#</a></dt>
+<dd><p>Enable inductor backend in torch.compile.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_piecewise_cuda_graph">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">torch_compile_piecewise_cuda_graph</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_piecewise_cuda_graph" title="Link to this definition">#</a></dt>
+<dd><p>Enable piecewise CUDA graph in torch.compile.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.use_cuda_graph">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">use_cuda_graph</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.use_cuda_graph" title="Link to this definition">#</a></dt>
+<dd><p>If true, use CUDA graphs for decoding. CUDA graphs are only created for the batch sizes in cuda_graph_batch_sizes, and are enabled for batches that consist of decoding requests <em>only</em> (the reason is that it’s hard to capture a single graph with prefill requests since the input shapes are a function of the sequence lengths). Note that each CUDA graph can use up to 200 MB of extra memory.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.use_kv_cache">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">use_kv_cache</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.use_kv_cache" title="Link to this definition">#</a></dt>
+<dd><p>Whether to use KV cache.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Validated by<span class="colon">:</span></dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><code class="xref py py-obj docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+<dl class="py method pydantic_validator">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.validate_cuda_graph_config">
+<em class="property"><span class="pre">validator</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">validate_cuda_graph_config</span></span><em class="autodoc_pydantic_validator_arrow property">&#160; <span class="pre">»</span>&#160; </em><em class="xref py py-obj"><span class="pre">all</span> <span class="pre">fields</span></em><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#TorchLlmArgs.validate_cuda_graph_config"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.validate_cuda_graph_config" title="Link to this definition">#</a></dt>
+<dd><p>Validate CUDA graph configuration.</p>
+<p>Ensures that:</p><ol class="arabic simple">
+<li><p>If cuda_graph_batch_sizes is provided, cuda_graph_max_batch_size must be 0</p></li>
+<li><p>If cuda_graph_batch_sizes is not provided, it is generated based on cuda_graph_max_batch_size</p></li>
+<li><p>If both are provided, cuda_graph_batch_sizes must match the generated values</p></li></ol>
+</dd></dl>
+
+<dl class="py method pydantic_validator">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TorchLlmArgs.validate_cuda_graph_max_batch_size">
+<em class="property"><span class="pre">validator</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">validate_cuda_graph_max_batch_size</span></span><em class="autodoc_pydantic_validator_arrow property">&#160; <span class="pre">»</span>&#160; </em><em class="xref py py-obj"><span class="pre">cuda_graph_max_batch_size</span></em><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#TorchLlmArgs.validate_cuda_graph_max_batch_size"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.TorchLlmArgs.validate_cuda_graph_max_batch_size" title="Link to this definition">#</a></dt>
+<dd><p>Validate cuda_graph_max_batch_size is non-negative.</p>
+</dd></dl>
+
+</dd></dl>
+
+<dl class="py class">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs">
+<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">TrtLlmArgs</span></span><span class="sig-paren">(</span>
+
+<dl>
+<dd><em class="sig-param"><span class="n"><span class="pre">*</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">model:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">~pathlib.Path</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">tokenizer:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">~pathlib.Path</span> <span class="pre">|</span> <span class="pre">~transformers.tokenization_utils_base.PreTrainedTokenizerBase</span> <span class="pre">|</span> <span class="pre">~tensorrt_llm.llmapi.tokenizer.TokenizerBase</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">tokenizer_mode:</span> <span class="pre">~typing.Literal['auto'</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">'slow']</span> <span class="pre">=</span> <span class="pre">'auto'</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">skip_tokenizer_init:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">trust_remote_code:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">tensor_parallel_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">dtype:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">'auto'</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">revision:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">tokenizer_revision:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">pipeline_parallel_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">context_parallel_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">gpus_per_node:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">moe_cluster_parallel_size:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">moe_tensor_parallel_size:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">moe_expert_parallel_size:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">enable_attention_dp:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">cp_config:</span> <span class="pre">dict</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">load_format:</span> <span class="pre">~typing.Literal['auto'</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">'dummy']</span> <span class="pre">=</span> <span class="pre">'auto'</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">enable_lora:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_lora_rank:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_loras:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">4</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_cpu_loras:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">4</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">lora_config:</span> <span class="pre">~tensorrt_llm.lora_manager.LoraConfig</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">enable_prompt_adapter:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_prompt_adapter_token:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">quant_config:</span> <span class="pre">~tensorrt_llm.models.modeling_utils.QuantConfig</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">kv_cache_config:</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.KvCacheConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">enable_chunked_prefill:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">guided_decoding_backend:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">batched_logits_processor:</span> <span class="pre">object</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">iter_stats_max_iterations:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">request_stats_max_iterations:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">peft_cache_config:</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.PeftCacheConfig</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">scheduler_config:</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.SchedulerConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">cache_transceiver_config:</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">speculative_config:</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig</span> <span class="pre">|</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.MedusaDecodingConfig</span> <span class="pre">|</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.EagleDecodingConfig</span> <span class="pre">|</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.MTPDecodingConfig</span> <span class="pre">|</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.NGramDecodingConfig</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">batching_type:</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.BatchingType</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">normalize_log_probs:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_batch_size:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_input_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1024</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_seq_len:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_beam_width:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">max_num_tokens:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">backend:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">gather_generation_logits:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">_num_postprocess_workers:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">_postprocess_tokenizer_dir:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">_reasoning_parser:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">decoding_config:</span> <span class="pre">object</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">_mpi_session:</span> <span class="pre">object</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">auto_parallel:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">auto_parallel_world_size:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">enable_tqdm:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">build_config:</span> <span class="pre">object</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">workspace:</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">enable_build_cache:</span> <span class="pre">object</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">extended_runtime_perf_knob_config:</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">calib_config:</span> <span class="pre">~tensorrt_llm.llmapi.llm_args.CalibConfig</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">embedding_parallel_mode:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">'SHARDING_ALONG_VOCAB'</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">fast_build:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
+<dd><em class="sig-param"><span class="n"><span class="pre">**extra_data:</span> <span class="pre">~typing.Any</span></span></em>,</dd>
+</dl>
+
+<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#TrtLlmArgs"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs" title="Link to this definition">#</a></dt>
+<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">BaseLlmArgs</span></code></p>
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.auto_parallel">
+<span class="sig-name descname"><span class="pre">auto_parallel</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.auto_parallel" title="Link to this definition">#</a></dt>
+<dd><p>Read-only data descriptor used to emit a runtime deprecation warning before accessing a deprecated field.</p>
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.msg">
+<span class="sig-name descname"><span class="pre">msg</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.msg" title="Link to this definition">#</a></dt>
+<dd><p>The deprecation message to be emitted.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.wrapped_property">
+<span class="sig-name descname"><span class="pre">wrapped_property</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.wrapped_property" title="Link to this definition">#</a></dt>
+<dd><p>The property instance if the deprecated field is a computed field, or <cite>None</cite>.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.field_name">
+<span class="sig-name descname"><span class="pre">field_name</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.field_name" title="Link to this definition">#</a></dt>
+<dd><p>The name of the field being deprecated.</p>
+</dd></dl>
+
+</dd></dl>
+
+<dl class="py property">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.auto_parallel_config">
+<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">auto_parallel_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">AutoParallelConfig</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.auto_parallel_config" title="Link to this definition">#</a></dt>
+<dd></dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.auto_parallel_world_size">
+<span class="sig-name descname"><span class="pre">auto_parallel_world_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.auto_parallel_world_size" title="Link to this definition">#</a></dt>
+<dd><p>Read-only data descriptor used to emit a runtime deprecation warning before accessing a deprecated field.</p>
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id19">
+<span class="sig-name descname"><span class="pre">msg</span></span><a class="headerlink" href="#id19" title="Link to this definition">#</a></dt>
+<dd><p>The deprecation message to be emitted.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id20">
+<span class="sig-name descname"><span class="pre">wrapped_property</span></span><a class="headerlink" href="#id20" title="Link to this definition">#</a></dt>
+<dd><p>The property instance if the deprecated field is a computed field, or <cite>None</cite>.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id21">
+<span class="sig-name descname"><span class="pre">field_name</span></span><a class="headerlink" href="#id21" title="Link to this definition">#</a></dt>
+<dd><p>The name of the field being deprecated.</p>
+</dd></dl>
+
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.build_config">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">build_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">object</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.build_config" title="Link to this definition">#</a></dt>
+<dd><p>Build config.</p>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.calib_config">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">calib_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="#tensorrt_llm.llmapi.CalibConfig" title="tensorrt_llm.llmapi.llm_args.CalibConfig"><span class="pre">CalibConfig</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.calib_config" title="Link to this definition">#</a></dt>
+<dd><p>Calibration config.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.decoding_config">
+<span class="sig-name descname"><span class="pre">decoding_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">object</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.decoding_config" title="Link to this definition">#</a></dt>
+<dd><p>Read-only data descriptor used to emit a runtime deprecation warning before accessing a deprecated field.</p>
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id22">
+<span class="sig-name descname"><span class="pre">msg</span></span><a class="headerlink" href="#id22" title="Link to this definition">#</a></dt>
+<dd><p>The deprecation message to be emitted.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id23">
+<span class="sig-name descname"><span class="pre">wrapped_property</span></span><a class="headerlink" href="#id23" title="Link to this definition">#</a></dt>
+<dd><p>The property instance if the deprecated field is a computed field, or <cite>None</cite>.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id24">
+<span class="sig-name descname"><span class="pre">field_name</span></span><a class="headerlink" href="#id24" title="Link to this definition">#</a></dt>
+<dd><p>The name of the field being deprecated.</p>
+</dd></dl>
+
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.embedding_parallel_mode">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">embedding_parallel_mode</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'SHARDING_ALONG_VOCAB'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.embedding_parallel_mode" title="Link to this definition">#</a></dt>
+<dd><p>The embedding parallel mode.</p>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.enable_build_cache">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">enable_build_cache</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">object</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.enable_build_cache" title="Link to this definition">#</a></dt>
+<dd><p>Enable build cache.</p>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.enable_tqdm">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">enable_tqdm</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.enable_tqdm" title="Link to this definition">#</a></dt>
+<dd><p>Enable tqdm for progress bar.</p>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.extended_runtime_perf_knob_config">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">extended_runtime_perf_knob_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig" title="tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig"><span class="pre">ExtendedRuntimePerfKnobConfig</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.extended_runtime_perf_knob_config" title="Link to this definition">#</a></dt>
+<dd><p>Extended runtime perf knob config.</p>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.fast_build">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">fast_build</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.fast_build" title="Link to this definition">#</a></dt>
+<dd><p>Enable fast build.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.max_cpu_loras">
+<span class="sig-name descname"><span class="pre">max_cpu_loras</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.max_cpu_loras" title="Link to this definition">#</a></dt>
+<dd><p>Read-only data descriptor used to emit a runtime deprecation warning before accessing a deprecated field.</p>
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id25">
+<span class="sig-name descname"><span class="pre">msg</span></span><a class="headerlink" href="#id25" title="Link to this definition">#</a></dt>
+<dd><p>The deprecation message to be emitted.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id26">
+<span class="sig-name descname"><span class="pre">wrapped_property</span></span><a class="headerlink" href="#id26" title="Link to this definition">#</a></dt>
+<dd><p>The property instance if the deprecated field is a computed field, or <cite>None</cite>.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id27">
+<span class="sig-name descname"><span class="pre">field_name</span></span><a class="headerlink" href="#id27" title="Link to this definition">#</a></dt>
+<dd><p>The name of the field being deprecated.</p>
+</dd></dl>
+
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.max_lora_rank">
+<span class="sig-name descname"><span class="pre">max_lora_rank</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.max_lora_rank" title="Link to this definition">#</a></dt>
+<dd><p>Read-only data descriptor used to emit a runtime deprecation warning before accessing a deprecated field.</p>
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id28">
+<span class="sig-name descname"><span class="pre">msg</span></span><a class="headerlink" href="#id28" title="Link to this definition">#</a></dt>
+<dd><p>The deprecation message to be emitted.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id29">
+<span class="sig-name descname"><span class="pre">wrapped_property</span></span><a class="headerlink" href="#id29" title="Link to this definition">#</a></dt>
+<dd><p>The property instance if the deprecated field is a computed field, or <cite>None</cite>.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id30">
+<span class="sig-name descname"><span class="pre">field_name</span></span><a class="headerlink" href="#id30" title="Link to this definition">#</a></dt>
+<dd><p>The name of the field being deprecated.</p>
+</dd></dl>
+
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.max_loras">
+<span class="sig-name descname"><span class="pre">max_loras</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.max_loras" title="Link to this definition">#</a></dt>
+<dd><p>Read-only data descriptor used to emit a runtime deprecation warning before accessing a deprecated field.</p>
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id31">
+<span class="sig-name descname"><span class="pre">msg</span></span><a class="headerlink" href="#id31" title="Link to this definition">#</a></dt>
+<dd><p>The deprecation message to be emitted.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id32">
+<span class="sig-name descname"><span class="pre">wrapped_property</span></span><a class="headerlink" href="#id32" title="Link to this definition">#</a></dt>
+<dd><p>The property instance if the deprecated field is a computed field, or <cite>None</cite>.</p>
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="id33">
+<span class="sig-name descname"><span class="pre">field_name</span></span><a class="headerlink" href="#id33" title="Link to this definition">#</a></dt>
+<dd><p>The name of the field being deprecated.</p>
+</dd></dl>
+
+</dd></dl>
+
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.model_config">
+<span class="sig-name descname"><span class="pre">model_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ClassVar</span><span class="p"><span class="pre">[</span></span><span class="pre">ConfigDict</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">{'arbitrary_types_allowed':</span> <span class="pre">True,</span> <span class="pre">'extra':</span> <span class="pre">'allow'}</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.model_config" title="Link to this definition">#</a></dt>
+<dd><p>Configuration for the model; should be a dictionary conforming to <cite>pydantic.config.ConfigDict</cite>.</p>
+</dd></dl>
+
+<dl class="py method">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.model_post_init">
+<span class="sig-name descname"><span class="pre">model_post_init</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">_TrtLlmArgs__context</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#TrtLlmArgs.model_post_init"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.model_post_init" title="Link to this definition">#</a></dt>
+<dd><p>Override this method to perform additional initialization after <cite>__init__</cite> and <cite>model_construct</cite>.
+This is useful if you want to do some validation that requires the entire model to be initialized.</p>
+</dd></dl>
+
+<dl class="py attribute pydantic_field">
+<dt class="sig sig-object py" id="tensorrt_llm.llmapi.TrtLlmArgs.workspace">
+<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">workspace</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.TrtLlmArgs.workspace" title="Link to this definition">#</a></dt>
+<dd><p>The workspace for the model.</p>
+</dd></dl>
+
+</dd></dl>
+
 </section>
 
 
@@ -3371,7 +4363,9 @@ changed, you should remove the caches manually.</p>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.__init__"><code class="docutils literal notranslate"><span class="pre">__init__()</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.decode_duration_ms"><code class="docutils literal notranslate"><span class="pre">decode_duration_ms</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.decode_retention_priority"><code class="docutils literal notranslate"><span class="pre">decode_retention_priority</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.directory"><code class="docutils literal notranslate"><span class="pre">directory</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.token_range_retention_configs"><code class="docutils literal notranslate"><span class="pre">token_range_retention_configs</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.transfer_mode"><code class="docutils literal notranslate"><span class="pre">transfer_mode</span></code></a></li>
 </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig"><code class="docutils literal notranslate"><span class="pre">LookaheadDecodingConfig</span></code></a><ul class="nav section-nav flex-column">
@@ -3383,7 +4377,7 @@ changed, you should remove the caches manually.</p>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.max_verification_set_size"><code class="docutils literal notranslate"><span class="pre">max_verification_set_size</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.max_window_size"><code class="docutils literal notranslate"><span class="pre">max_window_size</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.model_config"><code class="docutils literal notranslate"><span class="pre">model_config</span></code></a></li>
-<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.validate_positive_values"><code class="docutils literal notranslate"><span class="pre">validate_positive_values()</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.validate_positive_values"><code class="docutils literal notranslate"><span class="pre">validate_positive_values</span></code></a></li>
 </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MedusaDecodingConfig"><code class="docutils literal notranslate"><span class="pre">MedusaDecodingConfig</span></code></a><ul class="nav section-nav flex-column">
@@ -3397,6 +4391,7 @@ changed, you should remove the caches manually.</p>
 <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig"><code class="docutils literal notranslate"><span class="pre">EagleDecodingConfig</span></code></a><ul class="nav section-nav flex-column">
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig.decoding_type"><code class="docutils literal notranslate"><span class="pre">decoding_type</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig.dynamic_tree_max_topK"><code class="docutils literal notranslate"><span class="pre">dynamic_tree_max_topK</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig.eagle3_one_model"><code class="docutils literal notranslate"><span class="pre">eagle3_one_model</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig.eagle_choices"><code class="docutils literal notranslate"><span class="pre">eagle_choices</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig.from_dict"><code class="docutils literal notranslate"><span class="pre">from_dict()</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig.greedy_sampling"><code class="docutils literal notranslate"><span class="pre">greedy_sampling</span></code></a></li>
@@ -3576,6 +4571,129 @@ changed, you should remove the caches manually.</p>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CacheTransceiverConfig.model_config"><code class="docutils literal notranslate"><span class="pre">model_config</span></code></a></li>
 </ul>
 </li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.NGramDecodingConfig"><code class="docutils literal notranslate"><span class="pre">NGramDecodingConfig</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.NGramDecodingConfig.decoding_type"><code class="docutils literal notranslate"><span class="pre">decoding_type</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.NGramDecodingConfig.from_dict"><code class="docutils literal notranslate"><span class="pre">from_dict()</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.NGramDecodingConfig.is_keep_all"><code class="docutils literal notranslate"><span class="pre">is_keep_all</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.NGramDecodingConfig.is_public_pool"><code class="docutils literal notranslate"><span class="pre">is_public_pool</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.NGramDecodingConfig.is_use_oldest"><code class="docutils literal notranslate"><span class="pre">is_use_oldest</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.NGramDecodingConfig.max_matching_ngram_size"><code class="docutils literal notranslate"><span class="pre">max_matching_ngram_size</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.NGramDecodingConfig.model_config"><code class="docutils literal notranslate"><span class="pre">model_config</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.NGramDecodingConfig.prompt_lookup_num_tokens"><code class="docutils literal notranslate"><span class="pre">prompt_lookup_num_tokens</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LlmArgs"><code class="docutils literal notranslate"><span class="pre">LlmArgs</span></code></a></li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs"><code class="docutils literal notranslate"><span class="pre">TorchLlmArgs</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.attn_backend"><code class="docutils literal notranslate"><span class="pre">attn_backend</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.auto_deploy_config"><code class="docutils literal notranslate"><span class="pre">auto_deploy_config</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.autotuner_enabled"><code class="docutils literal notranslate"><span class="pre">autotuner_enabled</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.build_config"><code class="docutils literal notranslate"><span class="pre">build_config</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.convert_load_format"><code class="docutils literal notranslate"><span class="pre">convert_load_format</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.cuda_graph_batch_sizes"><code class="docutils literal notranslate"><span class="pre">cuda_graph_batch_sizes</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.cuda_graph_max_batch_size"><code class="docutils literal notranslate"><span class="pre">cuda_graph_max_batch_size</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.cuda_graph_padding_enabled"><code class="docutils literal notranslate"><span class="pre">cuda_graph_padding_enabled</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.decoding_config"><code class="docutils literal notranslate"><span class="pre">decoding_config</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.msg"><code class="docutils literal notranslate"><span class="pre">msg</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.wrapped_property"><code class="docutils literal notranslate"><span class="pre">wrapped_property</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.field_name"><code class="docutils literal notranslate"><span class="pre">field_name</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.disable_overlap_scheduler"><code class="docutils literal notranslate"><span class="pre">disable_overlap_scheduler</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.enable_iter_perf_stats"><code class="docutils literal notranslate"><span class="pre">enable_iter_perf_stats</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.enable_iter_req_stats"><code class="docutils literal notranslate"><span class="pre">enable_iter_req_stats</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.enable_layerwise_nvtx_marker"><code class="docutils literal notranslate"><span class="pre">enable_layerwise_nvtx_marker</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.enable_min_latency"><code class="docutils literal notranslate"><span class="pre">enable_min_latency</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.enable_trtllm_sampler"><code class="docutils literal notranslate"><span class="pre">enable_trtllm_sampler</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.extra_resource_managers"><code class="docutils literal notranslate"><span class="pre">extra_resource_managers</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.get_pytorch_backend_config"><code class="docutils literal notranslate"><span class="pre">get_pytorch_backend_config()</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.kv_cache_dtype"><code class="docutils literal notranslate"><span class="pre">kv_cache_dtype</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.load_format"><code class="docutils literal notranslate"><span class="pre">load_format</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.max_cpu_loras"><code class="docutils literal notranslate"><span class="pre">max_cpu_loras</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id10"><code class="docutils literal notranslate"><span class="pre">msg</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id11"><code class="docutils literal notranslate"><span class="pre">wrapped_property</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id12"><code class="docutils literal notranslate"><span class="pre">field_name</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.max_lora_rank"><code class="docutils literal notranslate"><span class="pre">max_lora_rank</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id13"><code class="docutils literal notranslate"><span class="pre">msg</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id14"><code class="docutils literal notranslate"><span class="pre">wrapped_property</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id15"><code class="docutils literal notranslate"><span class="pre">field_name</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.max_loras"><code class="docutils literal notranslate"><span class="pre">max_loras</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id16"><code class="docutils literal notranslate"><span class="pre">msg</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id17"><code class="docutils literal notranslate"><span class="pre">wrapped_property</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id18"><code class="docutils literal notranslate"><span class="pre">field_name</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.mixed_sampler"><code class="docutils literal notranslate"><span class="pre">mixed_sampler</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.model_config"><code class="docutils literal notranslate"><span class="pre">model_config</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.model_post_init"><code class="docutils literal notranslate"><span class="pre">model_post_init()</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.moe_backend"><code class="docutils literal notranslate"><span class="pre">moe_backend</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.moe_load_balancer"><code class="docutils literal notranslate"><span class="pre">moe_load_balancer</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.moe_max_num_tokens"><code class="docutils literal notranslate"><span class="pre">moe_max_num_tokens</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.print_iter_log"><code class="docutils literal notranslate"><span class="pre">print_iter_log</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_enable_userbuffers"><code class="docutils literal notranslate"><span class="pre">torch_compile_enable_userbuffers</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_enabled"><code class="docutils literal notranslate"><span class="pre">torch_compile_enabled</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_fullgraph"><code class="docutils literal notranslate"><span class="pre">torch_compile_fullgraph</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_inductor_enabled"><code class="docutils literal notranslate"><span class="pre">torch_compile_inductor_enabled</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_piecewise_cuda_graph"><code class="docutils literal notranslate"><span class="pre">torch_compile_piecewise_cuda_graph</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.use_cuda_graph"><code class="docutils literal notranslate"><span class="pre">use_cuda_graph</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.use_kv_cache"><code class="docutils literal notranslate"><span class="pre">use_kv_cache</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.validate_cuda_graph_config"><code class="docutils literal notranslate"><span class="pre">validate_cuda_graph_config</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TorchLlmArgs.validate_cuda_graph_max_batch_size"><code class="docutils literal notranslate"><span class="pre">validate_cuda_graph_max_batch_size</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs"><code class="docutils literal notranslate"><span class="pre">TrtLlmArgs</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.auto_parallel"><code class="docutils literal notranslate"><span class="pre">auto_parallel</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.msg"><code class="docutils literal notranslate"><span class="pre">msg</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.wrapped_property"><code class="docutils literal notranslate"><span class="pre">wrapped_property</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.field_name"><code class="docutils literal notranslate"><span class="pre">field_name</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.auto_parallel_config"><code class="docutils literal notranslate"><span class="pre">auto_parallel_config</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.auto_parallel_world_size"><code class="docutils literal notranslate"><span class="pre">auto_parallel_world_size</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id19"><code class="docutils literal notranslate"><span class="pre">msg</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id20"><code class="docutils literal notranslate"><span class="pre">wrapped_property</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id21"><code class="docutils literal notranslate"><span class="pre">field_name</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.build_config"><code class="docutils literal notranslate"><span class="pre">build_config</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.calib_config"><code class="docutils literal notranslate"><span class="pre">calib_config</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.decoding_config"><code class="docutils literal notranslate"><span class="pre">decoding_config</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id22"><code class="docutils literal notranslate"><span class="pre">msg</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id23"><code class="docutils literal notranslate"><span class="pre">wrapped_property</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id24"><code class="docutils literal notranslate"><span class="pre">field_name</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.embedding_parallel_mode"><code class="docutils literal notranslate"><span class="pre">embedding_parallel_mode</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.enable_build_cache"><code class="docutils literal notranslate"><span class="pre">enable_build_cache</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.enable_tqdm"><code class="docutils literal notranslate"><span class="pre">enable_tqdm</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.extended_runtime_perf_knob_config"><code class="docutils literal notranslate"><span class="pre">extended_runtime_perf_knob_config</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.fast_build"><code class="docutils literal notranslate"><span class="pre">fast_build</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.max_cpu_loras"><code class="docutils literal notranslate"><span class="pre">max_cpu_loras</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id25"><code class="docutils literal notranslate"><span class="pre">msg</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id26"><code class="docutils literal notranslate"><span class="pre">wrapped_property</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id27"><code class="docutils literal notranslate"><span class="pre">field_name</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.max_lora_rank"><code class="docutils literal notranslate"><span class="pre">max_lora_rank</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id28"><code class="docutils literal notranslate"><span class="pre">msg</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id29"><code class="docutils literal notranslate"><span class="pre">wrapped_property</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id30"><code class="docutils literal notranslate"><span class="pre">field_name</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.max_loras"><code class="docutils literal notranslate"><span class="pre">max_loras</span></code></a><ul class="nav section-nav flex-column">
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id31"><code class="docutils literal notranslate"><span class="pre">msg</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id32"><code class="docutils literal notranslate"><span class="pre">wrapped_property</span></code></a></li>
+<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#id33"><code class="docutils literal notranslate"><span class="pre">field_name</span></code></a></li>
+</ul>
+</li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.model_config"><code class="docutils literal notranslate"><span class="pre">model_config</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.model_post_init"><code class="docutils literal notranslate"><span class="pre">model_post_init()</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.TrtLlmArgs.workspace"><code class="docutils literal notranslate"><span class="pre">workspace</span></code></a></li>
+</ul>
+</li>
 </ul>
   </nav></div>
 
@@ -3667,6 +4785,15 @@ changed, you should remove the caches manually.</p>
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/objects.inv b/latest/objects.inv
index 49f74f2b4e..8a655c18d0 100644
Binary files a/latest/objects.inv and b/latest/objects.inv differ
diff --git a/latest/overview.html b/latest/overview.html
index 2e9115ebaa..fbb0cb079b 100644
--- a/latest/overview.html
+++ b/latest/overview.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -698,6 +702,15 @@ Certain limitations might apply. Refer to the <a class="reference internal" href
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/performance/perf-analysis.html b/latest/performance/perf-analysis.html
index 679ec35c22..527a17526d 100644
--- a/latest/performance/perf-analysis.html
+++ b/latest/performance/perf-analysis.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -757,6 +761,15 @@ python3<span class="w"> </span>benchmarks/cpp/prepare_dataset.py<span class="w">
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/performance/perf-benchmarking.html b/latest/performance/perf-benchmarking.html
index 5c31b3a82f..1c7fccb60e 100644
--- a/latest/performance/perf-benchmarking.html
+++ b/latest/performance/perf-benchmarking.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1166,8 +1170,7 @@ follow when a checkpoint does not specify a KV cache quantization algorithm:</p>
 </div>
 <p>If you would like to force the KV cache quantizaton, you can specify the following in the YAML file to force the precision
 when the checkpoint precision is <code class="docutils literal notranslate"><span class="pre">null</span></code>:</p>
-<div class="highlight-yaml notranslate"><div class="highlight"><pre><span></span><span class="nt">pytorch_backend_config</span><span class="p">:</span>
-<span class="w">  </span><span class="nt">kv_cache_dtype</span><span class="p">:</span><span class="w"> </span><span class="s">&quot;fp8&quot;</span>
+<div class="highlight-yaml notranslate"><div class="highlight"><pre><span></span><span class="nt">kv_cache_dtype</span><span class="p">:</span><span class="w"> </span><span class="s">&quot;fp8&quot;</span>
 </pre></div>
 </div>
 <div class="admonition tip">
@@ -1537,6 +1540,15 @@ The choices are specified with a YAML file like the following example (<code cla
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/performance/perf-overview.html b/latest/performance/perf-overview.html
index efeb575c54..6b5d83fde2 100644
--- a/latest/performance/perf-overview.html
+++ b/latest/performance/perf-overview.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1223,11 +1227,9 @@ a model name (HuggingFace reference or path to a local model), a <a class="refer
 </pre></div>
 </div>
 <p><code class="docutils literal notranslate"><span class="pre">llm_options.yml</span></code></p>
-<div class="highlight-yaml notranslate"><div class="highlight"><pre><span></span><span class="w"> </span><span class="nt">pytorch_backend_config</span><span class="p">:</span>
-<span class="w">  </span><span class="nt">enable_overlap_scheduler</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span>
-<span class="w">  </span><span class="nt">use_cuda_graph</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span>
-<span class="w">  </span><span class="nt">cuda_graph_padding_enabled</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span>
-<span class="w">  </span><span class="nt">cuda_graph_batch_sizes</span><span class="p">:</span>
+<div class="highlight-yaml notranslate"><div class="highlight"><pre><span></span><span class="nt">use_cuda_graph</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span>
+<span class="nt">cuda_graph_padding_enabled</span><span class="p">:</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">true</span>
+<span class="nt">cuda_graph_batch_sizes</span><span class="p">:</span>
 <span class="w">  </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">1</span>
 <span class="w">  </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">2</span>
 <span class="w">  </span><span class="p p-Indicator">-</span><span class="w"> </span><span class="l l-Scalar l-Scalar-Plain">4</span>
@@ -1462,6 +1464,15 @@ using the <code class="docutils literal notranslate"><span class="pre">--kv_cach
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/performance/performance-tuning-guide/benchmarking-default-performance.html b/latest/performance/performance-tuning-guide/benchmarking-default-performance.html
index 23b85795d8..59ab78800c 100644
--- a/latest/performance/performance-tuning-guide/benchmarking-default-performance.html
+++ b/latest/performance/performance-tuning-guide/benchmarking-default-performance.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -908,6 +912,15 @@ P99:<span class="w"> </span><span class="m">1</span>.00
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/performance/performance-tuning-guide/deciding-model-sharding-strategy.html b/latest/performance/performance-tuning-guide/deciding-model-sharding-strategy.html
index 093ab20674..f93cbdd593 100644
--- a/latest/performance/performance-tuning-guide/deciding-model-sharding-strategy.html
+++ b/latest/performance/performance-tuning-guide/deciding-model-sharding-strategy.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -687,6 +691,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/performance/performance-tuning-guide/fp8-quantization.html b/latest/performance/performance-tuning-guide/fp8-quantization.html
index 8258b55aa9..990e8fb4bb 100644
--- a/latest/performance/performance-tuning-guide/fp8-quantization.html
+++ b/latest/performance/performance-tuning-guide/fp8-quantization.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1019,6 +1023,15 @@ accuracy loss is unacceptable.</p>
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/performance/performance-tuning-guide/index.html b/latest/performance/performance-tuning-guide/index.html
index 2f6efd3851..6da81fa5ef 100644
--- a/latest/performance/performance-tuning-guide/index.html
+++ b/latest/performance/performance-tuning-guide/index.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -678,6 +682,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.html b/latest/performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.html
index 30d4b2e723..84b30626d9 100644
--- a/latest/performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.html
+++ b/latest/performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -869,6 +873,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/performance/performance-tuning-guide/useful-build-time-flags.html b/latest/performance/performance-tuning-guide/useful-build-time-flags.html
index 129ae2cdf8..d4bd801cd6 100644
--- a/latest/performance/performance-tuning-guide/useful-build-time-flags.html
+++ b/latest/performance/performance-tuning-guide/useful-build-time-flags.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -932,6 +936,15 @@ This can be enabled via the LLM-API as such</p>
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/performance/performance-tuning-guide/useful-runtime-flags.html b/latest/performance/performance-tuning-guide/useful-runtime-flags.html
index ac6bcf3b92..d4d2715d26 100644
--- a/latest/performance/performance-tuning-guide/useful-runtime-flags.html
+++ b/latest/performance/performance-tuning-guide/useful-runtime-flags.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -855,6 +859,15 @@ via <code class="docutils literal notranslate"><span class="pre">KVCacheConfig</
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/py-modindex.html b/latest/py-modindex.html
index 5491802114..a871415b16 100644
--- a/latest/py-modindex.html
+++ b/latest/py-modindex.html
@@ -50,7 +50,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -61,7 +61,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
 
@@ -332,6 +332,7 @@
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -353,6 +354,7 @@
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -417,6 +419,7 @@
 <li class="toctree-l1"><a class="reference internal" href="advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -451,6 +454,7 @@
 <li class="toctree-l1"><a class="reference internal" href="blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -674,6 +678,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/python-api/tensorrt_llm.functional.html b/latest/python-api/tensorrt_llm.functional.html
index d91f98cab4..65f4ac9f8e 100644
--- a/latest/python-api/tensorrt_llm.functional.html
+++ b/latest/python-api/tensorrt_llm.functional.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -634,6 +638,11 @@
 <span class="sig-name descname"><span class="pre">AUTO</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">3</span></em><a class="headerlink" href="#tensorrt_llm.functional.AllReduceStrategy.AUTO" title="Link to this definition">#</a></dt>
 <dd></dd></dl>
 
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.functional.AllReduceStrategy.LOWPRECISION">
+<span class="sig-name descname"><span class="pre">LOWPRECISION</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">6</span></em><a class="headerlink" href="#tensorrt_llm.functional.AllReduceStrategy.LOWPRECISION" title="Link to this definition">#</a></dt>
+<dd></dd></dl>
+
 <dl class="py attribute">
 <dt class="sig sig-object py" id="tensorrt_llm.functional.AllReduceStrategy.MIN_LATENCY">
 <span class="sig-name descname"><span class="pre">MIN_LATENCY</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1</span></em><a class="headerlink" href="#tensorrt_llm.functional.AllReduceStrategy.MIN_LATENCY" title="Link to this definition">#</a></dt>
@@ -6638,6 +6647,7 @@ function creates a constant tensor.</p></li>
 </li>
 <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.functional.AllReduceStrategy"><code class="docutils literal notranslate"><span class="pre">AllReduceStrategy</span></code></a><ul class="nav section-nav flex-column">
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.functional.AllReduceStrategy.AUTO"><code class="docutils literal notranslate"><span class="pre">AUTO</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.functional.AllReduceStrategy.LOWPRECISION"><code class="docutils literal notranslate"><span class="pre">LOWPRECISION</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.functional.AllReduceStrategy.MIN_LATENCY"><code class="docutils literal notranslate"><span class="pre">MIN_LATENCY</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.functional.AllReduceStrategy.NCCL"><code class="docutils literal notranslate"><span class="pre">NCCL</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.functional.AllReduceStrategy.ONESHOT"><code class="docutils literal notranslate"><span class="pre">ONESHOT</span></code></a></li>
@@ -6993,6 +7003,15 @@ function creates a constant tensor.</p></li>
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/python-api/tensorrt_llm.layers.html b/latest/python-api/tensorrt_llm.layers.html
index f19e306101..d6fc118f27 100644
--- a/latest/python-api/tensorrt_llm.layers.html
+++ b/latest/python-api/tensorrt_llm.layers.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -2601,6 +2605,15 @@ the number of tokens used for each task, should be equal to prompt_embedding_tab
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/python-api/tensorrt_llm.models.html b/latest/python-api/tensorrt_llm.models.html
index 4603150395..994310abbd 100644
--- a/latest/python-api/tensorrt_llm.models.html
+++ b/latest/python-api/tensorrt_llm.models.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -3031,6 +3035,11 @@ ranges of the dimensions of when using TRT dynamic shapes.</p>
 <span class="sig-name descname"><span class="pre">MEDUSA</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">4</span></em><a class="headerlink" href="#tensorrt_llm.models.SpeculativeDecodingMode.MEDUSA" title="Link to this definition">#</a></dt>
 <dd></dd></dl>
 
+<dl class="py attribute">
+<dt class="sig sig-object py" id="tensorrt_llm.models.SpeculativeDecodingMode.NGRAM">
+<span class="sig-name descname"><span class="pre">NGRAM</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">64</span></em><a class="headerlink" href="#tensorrt_llm.models.SpeculativeDecodingMode.NGRAM" title="Link to this definition">#</a></dt>
+<dd></dd></dl>
+
 <dl class="py attribute">
 <dt class="sig sig-object py" id="tensorrt_llm.models.SpeculativeDecodingMode.NONE">
 <span class="sig-name descname"><span class="pre">NONE</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1</span></em><a class="headerlink" href="#tensorrt_llm.models.SpeculativeDecodingMode.NONE" title="Link to this definition">#</a></dt>
@@ -3466,6 +3475,7 @@ ranges of the dimensions of when using TRT dynamic shapes.</p>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.models.SpeculativeDecodingMode.EXPLICIT_DRAFT_TOKENS"><code class="docutils literal notranslate"><span class="pre">EXPLICIT_DRAFT_TOKENS</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.models.SpeculativeDecodingMode.LOOKAHEAD_DECODING"><code class="docutils literal notranslate"><span class="pre">LOOKAHEAD_DECODING</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.models.SpeculativeDecodingMode.MEDUSA"><code class="docutils literal notranslate"><span class="pre">MEDUSA</span></code></a></li>
+<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.models.SpeculativeDecodingMode.NGRAM"><code class="docutils literal notranslate"><span class="pre">NGRAM</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.models.SpeculativeDecodingMode.NONE"><code class="docutils literal notranslate"><span class="pre">NONE</span></code></a></li>
 <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.models.SpeculativeDecodingMode.from_arguments"><code class="docutils literal notranslate"><span class="pre">from_arguments()</span></code></a></li>
 </ul>
@@ -3567,6 +3577,15 @@ ranges of the dimensions of when using TRT dynamic shapes.</p>
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/python-api/tensorrt_llm.plugin.html b/latest/python-api/tensorrt_llm.plugin.html
index fc3f02c8d1..3ed44e3115 100644
--- a/latest/python-api/tensorrt_llm.plugin.html
+++ b/latest/python-api/tensorrt_llm.plugin.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -691,6 +695,15 @@ migrated to the centralized building script <cite>tensorrt_llm/commands/build.py
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/python-api/tensorrt_llm.quantization.html b/latest/python-api/tensorrt_llm.quantization.html
index c8639836ca..eb023c5228 100644
--- a/latest/python-api/tensorrt_llm.quantization.html
+++ b/latest/python-api/tensorrt_llm.quantization.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -729,6 +733,15 @@ the quantized model as TRT-LLM checkpoint</p>
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/python-api/tensorrt_llm.runtime.html b/latest/python-api/tensorrt_llm.runtime.html
index a784554226..c4f6e49965 100644
--- a/latest/python-api/tensorrt_llm.runtime.html
+++ b/latest/python-api/tensorrt_llm.runtime.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -3271,6 +3275,15 @@ For example, word_dict[2] = [” I am happy”, “ I am sad”].</p>
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/quick-start-guide.html b/latest/quick-start-guide.html
index c7fa260d17..597747797c 100644
--- a/latest/quick-start-guide.html
+++ b/latest/quick-start-guide.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -831,6 +835,15 @@ The model definition is a minimal example that shows some of the optimizations a
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/reference/memory.html b/latest/reference/memory.html
index 18d5ce4a34..299aadd787 100644
--- a/latest/reference/memory.html
+++ b/latest/reference/memory.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -778,6 +782,15 @@ Here some explanations on how these values affect the memory:</p>
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/reference/precision.html b/latest/reference/precision.html
index c79d59135e..9952239e75 100644
--- a/latest/reference/precision.html
+++ b/latest/reference/precision.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -1274,6 +1278,15 @@ are:</p>
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/reference/support-matrix.html b/latest/reference/support-matrix.html
index 2c1595d05c..667a4a5efa 100644
--- a/latest/reference/support-matrix.html
+++ b/latest/reference/support-matrix.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -934,6 +938,15 @@ In addition, older architectures can have limitations for newer software release
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/reference/troubleshooting.html b/latest/reference/troubleshooting.html
index 5356d7288f..3137b6b16d 100644
--- a/latest/reference/troubleshooting.html
+++ b/latest/reference/troubleshooting.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -964,6 +968,15 @@ dedicated MPI environment, not the one provided by your Slurm allocation.</p>
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/release-notes.html b/latest/release-notes.html
index 515c325086..6dc54b2080 100644
--- a/latest/release-notes.html
+++ b/latest/release-notes.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -2041,6 +2045,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/search.html b/latest/search.html
index 2537c79245..44d45c0d6e 100644
--- a/latest/search.html
+++ b/latest/search.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -69,7 +69,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -338,6 +338,7 @@
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -359,6 +360,7 @@
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -423,6 +425,7 @@
 <li class="toctree-l1"><a class="reference internal" href="advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -457,6 +460,7 @@
 <li class="toctree-l1"><a class="reference internal" href="blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -619,6 +623,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/searchindex.js b/latest/searchindex.js
index e8012775e0..16ebaf26ff 100644
--- a/latest/searchindex.js
+++ b/latest/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"alltitles": {"1. Download TensorRT-LLM": [[18, "download-tensorrt-llm"]], "1. Weights size": [[84, "weights-size"]], "2. Activation size": [[84, "activation-size"]], "2. Download the DeepSeek R1 models": [[18, "download-the-deepseek-r1-models"]], "3. Build and run TensorRT-LLM container": [[18, "build-and-run-tensorrt-llm-container"]], "3. I/O tensors": [[84, "i-o-tensors"]], "3.1 Runtime and decoder buffers except KV cache tensor": [[84, "runtime-and-decoder-buffers-except-kv-cache-tensor"]], "3.2 KV cache tensor": [[84, "kv-cache-tensor"]], "4. Compile and Install TensorRT-LLM": [[18, "compile-and-install-tensorrt-llm"]], "5. Optional: Tune GPU clocks": [[18, "optional-tune-gpu-clocks"]], "6. Dataset preparation": [[18, "dataset-preparation"]], "@record_signature to Decorate Functionals Requiring FLayerInfo": [[7, "record-signature-to-decorate-functionals-requiring-flayerinfo"]], "ALiBi": [[5, "alibi"]], "API": [[3, "api"]], "API Changes": [[11, "api-changes"], [88, "api-changes"], [88, "id9"], [88, "id14"], [88, "id19"], [88, "id24"], [88, "id31"], [88, "id36"], [88, "id42"], [88, "id48"], [88, "id54"]], "API Introduction": [[64, null]], "API Reference": [[65, null]], "AWQ Quantization Scaling Factors": [[13, "awq-quantization-scaling-factors"]], "About": [[26, "about"]], "About Speculative Sampling": [[10, "about-speculative-sampling"]], "About TensorRT-LLM": [[66, "about-tensorrt-llm"]], "Accuracy": [[23, "accuracy"]], "Acknowledgment": [[24, "acknowledgment"]], "Activation": [[78, "module-tensorrt_llm.layers.activation"]], "Adding a Model": [[12, null]], "Adding a New Model in PyTorch Backend": [[90, null]], "Advanced": [[59, null]], "Announcements": [[88, "announcements"], [88, "id52"]], "Architecture": [[59, null]], "Architecture Ovewiew": [[91, null]], "Asyncio-Based Generation": [[32, "asyncio-based-generation"]], "Attention": [[78, "module-tensorrt_llm.layers.attention"], [92, null]], "Attention Backends": [[92, "attention-backends"]], 
"Attention Kernel": [[24, "attention-kernel"]], "Attention Weights": [[13, "attention-weights"]], "Auto parallel arguments": [[25, "tensorrt_llm.commands.build-parse_arguments-auto-parallel-arguments"]], "Automatic Parallelism with LLM": [[38, null]], "Autoregressive MTP Layers": [[24, "autoregressive-mtp-layers"]], "B200 max-throughput": [[18, "b200-max-throughput"]], "B200 min-latency": [[18, "b200-min-latency"]], "Background": [[24, "background"]], "Beam-Search": [[5, "beam-search"]], "Before Benchmarking": [[68, "before-benchmarking"]], "Before You Begin: TensorRT-LLM LLM-API": [[70, "before-you-begin-tensorrt-llm-llm-api"]], "Benchmark": [[18, "benchmark"], [23, "benchmark"], [26, "benchmark"]], "Benchmarking Default Performance": [[70, null]], "Benchmarking a non-Medusa Low Latency Engine": [[68, "benchmarking-a-non-medusa-low-latency-engine"]], "Benchmarking with trtllm-bench": [[70, "benchmarking-with-trtllm-bench"]], "Benchmarks": [[2, "benchmarks"]], "Best practices to choose the right quantization methods": [[23, "best-practices-to-choose-the-right-quantization-methods"]], "Boost settings": [[68, "boost-settings"]], "Build APIs": [[17, "build-apis"]], "Build Checkpoint into TensorRT Engine": [[13, "build-checkpoint-into-tensorrt-engine"]], "Build Configuration": [[32, "build-configuration"]], "Build TensorRT-LLM": [[60, "build-tensorrt-llm"]], "Build the TensorRT-LLM Docker Image": [[27, null]], "Build the TensorRT-LLM Docker Image and Upload to DockerHub": [[27, "build-the-tensorrt-llm-docker-image-and-upload-to-dockerhub"], [28, "build-the-tensorrt-llm-docker-image-and-upload-to-dockerhub"]], "Building a Benchmark Engine": [[68, "building-a-benchmark-engine"]], "Building a Medusa Low-Latency Engine": [[68, "building-a-medusa-low-latency-engine"]], "Building a TensorRT-LLM Docker Image": [[60, "building-a-tensorrt-llm-docker-image"]], "Building and Saving Engines via CLI": [[70, "building-and-saving-engines-via-cli"]], "Building and Saving the Engine": 
[[70, "building-and-saving-the-engine"]], "Building from Source Code on Linux": [[60, null]], "Building the Python Bindings for the C++ Runtime": [[60, "building-the-python-bindings-for-the-c-runtime"]], "C++ Executor API Example": [[3, "c-executor-api-example"]], "C++ GPT Runtime": [[6, null]], "C++ runtime": [[84, "c-runtime"], [84, "id1"]], "CLI Tools": [[17, "cli-tools"]], "CUDA Graph & Programmatic Dependent Launch": [[24, "cuda-graph-programmatic-dependent-launch"]], "CUTLASS Backend (default backend)": [[24, "cutlass-backend-default-backend"]], "Capacity Scheduler Policy": [[76, "capacity-scheduler-policy"]], "Cast": [[78, "module-tensorrt_llm.layers.cast"]], "Chat API": [[26, "chat-api"]], "Chunked Context": [[5, "chunked-context"]], "Classical Workflow": [[7, "classical-workflow"]], "Closing": [[19, "closing"], [22, "closing"]], "Collect PyTorch profiler results": [[67, "collect-pytorch-profiler-results"]], "Command Overview": [[69, "command-overview"]], "Common LLM Support": [[66, "common-llm-support"]], "Communication Kernel": [[24, "communication-kernel"]], "Compilation": [[14, "compilation"]], "Compile the Model into a TensorRT Engine": [[83, "compile-the-model-into-a-tensorrt-engine"]], "Completions API": [[26, "completions-api"], [26, "id1"]], "Conclusion": [[72, "conclusion"], [74, "conclusion"], [75, "conclusion"]], "Config": [[13, "config"]], "Configure SSH Key": [[28, "configure-ssh-key"]], "Configure The Executor": [[3, "configure-the-executor"]], "Connect to the Pod": [[28, "connect-to-the-pod"]], "Context Chunking Policy": [[76, "context-chunking-policy"]], "Context Phase": [[5, "context-phase"]], "Context and Generation Phases": [[5, "context-and-generation-phases"]], "Contiguous KV Cache": [[5, "contiguous-kv-cache"]], "Control generated text using logits processor": [[47, null]], "Controlling output with Logits Post-Processor": [[3, "controlling-output-with-logits-post-processor"]], "Conv": [[78, "module-tensorrt_llm.layers.conv"]], 
"Conversion APIs": [[17, "conversion-apis"]], "Coordinating with NVIDIA Nsight Systems Launch": [[67, "coordinating-with-nvidia-nsight-systems-launch"]], "Coordinating with PyTorch profiler (PyTorch workflow only)": [[67, "coordinating-with-pytorch-profiler-pytorch-workflow-only"]], "Core Models": [[90, "core-models"]], "Create a Pod Template": [[28, "create-a-pod-template"]], "Create a Runpod account": [[28, "create-a-runpod-account"]], "Create the Container": [[60, "create-the-container"]], "Cross Attention": [[5, "cross-attention"]], "Curl Chat Client": [[29, null]], "Curl Chat Client For Multimodal": [[30, null]], "Curl Completion Client": [[31, null]], "Customize KV Cache Manager": [[93, "customize-kv-cache-manager"]], "Customize Your Own Scheduler": [[94, "customize-your-own-scheduler"]], "Debug Execution Errors": [[87, "debug-execution-errors"]], "Debug on E2E Models": [[87, "debug-on-e2e-models"]], "Debug on Unit Tests": [[87, "debug-on-unit-tests"]], "Debugging FAQs": [[2, "debugging-faqs"]], "Deciding Model Sharding Strategy": [[71, null]], "Decoder": [[91, "decoder"]], "Deepseek R1 Reasoning Parser": [[33, null]], "Default Build Behavior": [[68, "default-build-behavior"]], "Dense GEMM optimization": [[24, "dense-gemm-optimization"]], "Deploy with Triton Inference Server": [[83, "deploy-with-triton-inference-server"]], "Deploy with trtllm-serve": [[83, "deploy-with-trtllm-serve"]], "Develop TensorRT-LLM on Runpod": [[28, null]], "Developer Guide": [[89, "developer-guide"]], "Disable Tokenizer": [[32, "disable-tokenizer"]], "Disaggregated-Service (experimental)": [[2, null]], "Distributed LLM Generation": [[45, null]], "DoRA": [[9, "dora"]], "Documentation": [[88, "documentation"], [88, "id28"]], "Draft-Target-Model": [[10, "draft-target-model"]], "EAGLE": [[10, "eagle"]], "Embedding": [[78, "module-tensorrt_llm.layers.embedding"]], "Enable GIL information in NVTX markers": [[67, "enable-gil-information-in-nvtx-markers"]], "Enable garbage collection (GC) 
NVTX markers": [[67, "enable-garbage-collection-gc-nvtx-markers"]], "Enable kv cache reuse for p-tuning": [[8, "enable-kv-cache-reuse-for-p-tuning"]], "Enable more NVTX markers for debugging": [[67, "enable-more-nvtx-markers-for-debugging"]], "Enable ssh access to the container": [[27, "enable-ssh-access-to-the-container"]], "Enabling GEMM + SwiGLU Fusion": [[72, "enabling-gemm-swiglu-fusion"]], "Enabling GEMM Plugin": [[75, "enabling-gemm-plugin"]], "Enabling Low Latency GEMM plugin": [[72, "enabling-low-latency-gemm-plugin"]], "Enabling Paged Context Attention": [[75, "enabling-paged-context-attention"]], "Enabling Quantization": [[72, "enabling-quantization"]], "Enabling Quantized KV Cache": [[72, "enabling-quantized-kv-cache"]], "Enabling Reduce Norm Fusion Plugin": [[75, "enabling-reduce-norm-fusion-plugin"]], "Enabling Reduce Norm Fusion with User Buffers": [[72, "enabling-reduce-norm-fusion-with-user-buffers"]], "Enabling building with multiple profiles": [[75, "enabling-building-with-multiple-profiles"]], "Environment Variables": [[2, "environment-variables"]], "Everything in One Diagram": [[24, "everything-in-one-diagram"]], "Example": [[2, "example"], [13, "example"]], "Example LoRA tensors": [[9, "example-lora-tensors"]], "Example of Build Subcommand Output:": [[68, "example-of-build-subcommand-output"]], "Examples": [[14, "examples"], [15, "examples"], [67, "examples"]], "Executor": [[0, null]], "Executor API": [[3, null]], "Expected Result Format": [[18, "expected-result-format"], [18, "id1"], [18, "id2"]], "Expected Results": [[18, "expected-results"]], "Expert Parallelism in TensorRT-LLM": [[4, null]], "Exploring more ISL/OSL combinations": [[18, "exploring-more-isl-osl-combinations"]], "FAQ": [[84, "faq"]], "FLayerInfo for Retrieving High-Level Information for a Functional": [[7, "flayerinfo-for-retrieving-high-level-information-for-a-functional"]], "FP32, FP16 and BF16": [[85, "fp32-fp16-and-bf16"]], "FP4 Models:": [[69, "fp4-models"]], "FP8 
(Hopper)": [[85, "fp8-hopper"]], "FP8 Context FMHA": [[5, "fp8-context-fmha"]], "FP8 Models:": [[69, "fp8-models"]], "FP8 Quantization": [[72, null]], "FP8 Quantization Scaling Factors": [[13, "fp8-quantization-scaling-factors"]], "FP8 Support": [[66, "fp8-support"]], "FP8 \u201cBaseline\u201d Performance": [[72, "fp8-baseline-performance"]], "Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100": [[19, null]], "Falcon-180B on a single H200 with INT4 AWQ": [[19, "falcon-180b-on-a-single-h200-with-int4-awq"]], "Feature Descriptions": [[67, "feature-descriptions"]], "Fixed Issues": [[88, "fixed-issues"], [88, "id11"], [88, "id15"], [88, "id21"], [88, "id26"], [88, "id33"], [88, "id38"], [88, "id44"], [88, "id50"], [88, "id56"], [88, "id61"]], "Fully customized": [[15, "fully-customized"]], "Functionals": [[77, null]], "Fuse_A_GEMM": [[24, "fuse-a-gemm"]], "Future Works": [[24, "future-works"]], "Future-Style Generation": [[32, "future-style-generation"]], "GEMM + SwiGLU Fusion in Gated-MLP": [[72, "gemm-swiglu-fusion-in-gated-mlp"]], "GEMM Plugin": [[75, "gemm-plugin"]], "GPTQ and AWQ (W4A16)": [[85, "gptq-and-awq-w4a16"]], "GPU Clock Management": [[68, "gpu-clock-management"]], "Genai Perf Client": [[34, null]], "Genai Perf Client For Multimodal": [[35, null]], "General FAQs": [[2, "general-faqs"]], "Generate Text Asynchronously": [[42, null]], "Generate Text Using Eagle Decoding": [[39, null]], "Generate Text Using Lookahead Decoding": [[48, null]], "Generate Text Using Medusa Decoding": [[49, null]], "Generate Text in Streaming": [[43, null]], "Generate text": [[41, null]], "Generate text with customization": [[44, null]], "Generate text with guided decoding": [[40, null]], "Generate text with multiple LoRA adapters": [[53, null]], "Generation": [[32, "generation"]], "Generation Phase": [[5, "generation-phase"]], "Generation with Quantization": [[54, null]], "Get KV Cache Events": [[46, null]], "Getting Started": [[59, null]], "Graph 
Rewriting APIs": [[7, "graph-rewriting-apis"]], "Graph Rewriting Module": [[7, null]], "Grouped GEMM": [[24, "grouped-gemm"]], "H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token": [[20, null]], "H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM": [[21, null]], "H200 max-throughput": [[18, "h200-max-throughput"]], "H200 min-latency": [[18, "h200-min-latency"]], "H200 vs H100": [[21, "h200-vs-h100"]], "Hardware": [[86, "hardware"]], "How the Benchmarker Works": [[68, "how-the-benchmarker-works"]], "How to Enable": [[4, "how-to-enable"]], "How to Think about Model Sharding: Communication is Key": [[71, "how-to-think-about-model-sharding-communication-is-key"]], "How to change Max Batch Size": [[74, "how-to-change-max-batch-size"]], "How to change Max Num Tokens": [[74, "how-to-change-max-num-tokens"]], "How to enable kv cache reuse": [[8, "how-to-enable-kv-cache-reuse"]], "How to get best performance on DeepSeek-R1 in TensorRT-LLM": [[18, null]], "How to reproduce": [[24, "how-to-reproduce"]], "How to set Tensor Parallelism and Pipeline Parallelism": [[71, "how-to-set-tensor-parallelism-and-pipeline-parallelism"]], "Hugging Face Hub": [[64, "hugging-face-hub"]], "INT4 and INT8 Weight-Only (W4A16 and W8A16)": [[85, "int4-and-int8-weight-only-w4a16-and-w8a16"]], "INT8 SmoothQuant (W8A8)": [[85, "int8-smoothquant-w8a8"]], "INT8/FP8 KV Caches": [[5, "int8-fp8-kv-caches"]], "Implement AttentionBackend": [[92, "implement-attentionbackend"]], "Implement AttentionMetadata": [[92, "implement-attentionmetadata"]], "Implement a New Attention Backend": [[92, "implement-a-new-attention-backend"]], "Implementation Configuration": [[24, "implementation-configuration"]], "Important Note": [[5, "important-note"]], "In-Flight Batching and Paged Attention": [[66, "in-flight-batching-and-paged-attention"]], "In-flight Batching": [[5, "in-flight-batching"]], "In-flight Batching with the Triton Inference Server": [[3, 
"in-flight-batching-with-the-triton-inference-server"]], "Indices and tables": [[59, "indices-and-tables"]], "Inference Endpoints": [[26, "inference-endpoints"]], "Infrastructure Changes": [[88, "infrastructure-changes"], [88, "id4"], [88, "id7"], [88, "id12"], [88, "id16"], [88, "id22"], [88, "id27"], [88, "id34"], [88, "id39"], [88, "id45"]], "Infrastructure changes": [[88, "id51"]], "Input QKV tensor": [[5, "input-qkv-tensor"]], "Installation": [[59, null]], "Installation Errors": [[87, "installation-errors"]], "Installing on Grace Hopper": [[61, null]], "Installing on Linux": [[62, null]], "Interfaces": [[93, "interfaces"]], "Internal Components": [[6, "internal-components"]], "Introduction": [[90, "introduction"]], "KV Cache": [[5, "kv-cache"]], "KV Cache Manager": [[93, null]], "KV Cache Manager Introduction": [[93, "kv-cache-manager-introduction"]], "KV Cache Quantization Scaling Factors": [[13, "kv-cache-quantization-scaling-factors"]], "KV cache reuse": [[8, null]], "KVCacheManager": [[91, "kvcachemanager"]], "Kernel Level optimizations": [[24, "kernel-level-optimizations"]], "Kernel fusion": [[24, "kernel-fusion"]], "Key Components": [[89, "key-components"]], "Key Features": [[63, null]], "Key Features and Enhancements": [[88, "key-features-and-enhancements"], [88, "id2"], [88, "id3"], [88, "id5"], [88, "id8"], [88, "id13"], [88, "id18"], [88, "id23"], [88, "id30"], [88, "id35"], [88, "id41"], [88, "id47"], [88, "id53"], [88, "id57"], [88, "id59"]], "Key Optimizations": [[24, "key-optimizations"]], "Known Issues": [[84, "known-issues"], [88, "known-issues"], [88, "id6"], [88, "id10"], [88, "id17"], [88, "id29"], [88, "id40"], [88, "id46"], [88, "id62"], [89, "known-issues"]], "Known Limitations": [[60, "known-limitations"]], "LLM API": [[83, "llm-api"]], "LLM API Examples": [[36, null]], "LLM Common Customizations": [[32, null]], "LLM Examples": [[37, null]], "LLM Examples Introduction": [[36, null]], "LLM Models": [[86, "llm-models"]], "Latest GPU 
Support": [[66, "latest-gpu-support"]], "Latest HBM Memory": [[21, "latest-hbm-memory"]], "LayerNorm Weights": [[13, "layernorm-weights"]], "Layers": [[78, null]], "Limitations": [[10, "limitations"], [88, "limitations"]], "Limitations and Caveats": [[68, "limitations-and-caveats"]], "Linear": [[78, "module-tensorrt_llm.layers.linear"]], "Linking with the TensorRT-LLM C++ Runtime": [[60, "linking-with-the-tensorrt-llm-c-runtime"]], "Llama 3.1 405B": [[14, "llama-3-1-405b"]], "Llama 3.1 405B FP4": [[69, "llama-3-1-405b-fp4"]], "Llama 3.1 405B FP8": [[69, "llama-3-1-405b-fp8"]], "Llama 3.1 70B": [[14, "llama-3-1-70b"]], "Llama 3.1 70B FP8": [[69, "llama-3-1-70b-fp8"]], "Llama 3.1 8B FP8": [[69, "llama-3-1-8b-fp8"]], "Llama 3.3 70B FP4": [[69, "llama-3-3-70b-fp4"]], "Llama-70B on H200 up to 2.4x increased throughput with XQA within same latency budget": [[22, "llama-70b-on-h200-up-to-2-4x-increased-throughput-with-xqa-within-same-latency-budget"]], "Llama-70B on H200 up to 6.7x A100": [[19, "llama-70b-on-h200-up-to-6-7x-a100"]], "Llm Mgmn Llm Distributed": [[50, null]], "Llm Mgmn Trtllm Bench": [[51, null]], "Llm Mgmn Trtllm Serve": [[52, null]], "LoRA Module id mapping": [[9, "lora-module-id-mapping"]], "LoRA arguments": [[25, "tensorrt_llm.commands.build-parse_arguments-lora-arguments"]], "LoRA tensor format details": [[9, "lora-tensor-format-details"]], "LoRA with tensor parallel": [[9, "lora-with-tensor-parallel"]], "Loading function": [[15, "loading-function"]], "Local Hugging Face Models": [[64, "local-hugging-face-models"]], "Local TensorRT-LLM Engine": [[64, "local-tensorrt-llm-engine"]], "Logits arguments": [[25, "tensorrt_llm.commands.build-parse_arguments-logits-arguments"]], "Lookahead Decoding": [[10, "lookahead-decoding"]], "LoraCache configuration": [[9, "loracache-configuration"]], "Low Latency Benchmark": [[68, "low-latency-benchmark"]], "Low Latency GEMM Plugin": [[72, "low-latency-gemm-plugin"]], "Low Latency TensorRT-LLM Engine for Llama-3 70B": 
[[68, "low-latency-tensorrt-llm-engine-for-llama-3-70b"]], "MLP": [[78, "module-tensorrt_llm.layers.mlp"]], "MLP Weights": [[13, "mlp-weights"]], "MLPerf on H100 with FP8": [[20, "mlperf-on-h100-with-fp8"]], "MTP": [[24, "mtp"]], "Make Evaluation": [[13, "make-evaluation"]], "Mark Tensors As Output": [[3, "mark-tensors-as-output"]], "Max Throughput Benchmark": [[68, "max-throughput-benchmark"]], "Max Tokens in Paged KV Cache and KV Cache Free GPU Memory Fraction": [[76, "max-tokens-in-paged-kv-cache-and-kv-cache-free-gpu-memory-fraction"]], "Maximum Attention Window Size": [[76, "maximum-attention-window-size"]], "Medusa": [[10, "medusa"]], "Medusa Tree": [[10, "medusa-tree"]], "Memory Usage of TensorRT-LLM": [[84, null]], "Memory pool": [[84, "memory-pool"]], "Metrics Endpoint": [[26, "metrics-endpoint"]], "Mixed ETP": [[24, "mixed-etp"]], "Mixture of Experts (MoE)": [[4, "mixture-of-experts-moe"]], "Model Architecture": [[24, "model-architecture"]], "Model Configuration": [[6, "model-configuration"], [90, "model-configuration"]], "Model Definition": [[14, null], [90, "model-definition"]], "Model Definition API": [[83, "model-definition-api"]], "Model Engine": [[14, "model-engine"], [91, "model-engine"]], "Model Preparation": [[64, "model-preparation"]], "Model Registration": [[90, "model-registration"]], "Model Updates": [[88, "model-updates"], [88, "id20"], [88, "id25"], [88, "id32"], [88, "id37"], [88, "id43"], [88, "id49"], [88, "id55"], [88, "id58"], [88, "id60"]], "Model Weights": [[16, "model-weights"]], "Models": [[79, null]], "Models (PyTorch Backend)": [[86, "models-pytorch-backend"]], "Models (TensorRT Backend)": [[86, "models-tensorrt-backend"]], "Models with customized key names": [[15, "models-with-customized-key-names"]], "Models with customized weight layout": [[15, "models-with-customized-weight-layout"]], "Multi-GPU Multi-Node Inference": [[66, "multi-gpu-multi-node-inference"]], "Multi-GPU and Multi-Node Support": [[14, 
"multi-gpu-and-multi-node-support"]], "Multi-Head, Multi-Query, and Group-Query Attention": [[5, null]], "Multi-Modal Models 3": [[86, "multi-modal-models"]], "Multi-node Serving with Slurm": [[26, "multi-node-serving-with-slurm"]], "Multi-streams": [[24, "multi-streams"]], "Multimodal Serving": [[26, "multimodal-serving"]], "Multiple Profiles": [[75, "multiple-profiles"]], "NVFP4 (Blackwell)": [[85, "nvfp4-blackwell"]], "Named Arguments": [[25, "tensorrt_llm.commands.build-parse_arguments-named-arguments"]], "Native Windows Support": [[66, "native-windows-support"]], "Natively supported models": [[15, "natively-supported-models"]], "New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget": [[22, null]], "Next Steps": [[83, "next-steps"]], "Normalization": [[78, "module-tensorrt_llm.layers.normalization"]], "Note on context outputs": [[3, "note-on-context-outputs"]], "Numerical Precision": [[85, null]], "Obtaining Arbitrary Output Tensors": [[3, "obtaining-arbitrary-output-tensors"]], "Offloading to host memory": [[8, "offloading-to-host-memory"]], "Online Serving Examples": [[58, null]], "Only collect specific iterations": [[67, "only-collect-specific-iterations"]], "OpenAI Chat Client": [[55, null], [56, null]], "OpenAI Completion Client": [[57, null]], "Option 1: Build TensorRT-LLM in One Step": [[60, "option-1-build-tensorrt-llm-in-one-step"]], "Option 1: Full Build with C++ Compilation": [[60, "option-1-full-build-with-c-compilation"]], "Option 2: Build TensorRT-LLM Step-by-Step": [[60, "option-2-build-tensorrt-llm-step-by-step"]], "Option 2: Python-Only Build without C++ Compilation": [[60, "option-2-python-only-build-without-c-compilation"]], "Other Build Modes": [[68, "other-build-modes"]], "Out of memory issues": [[18, "out-of-memory-issues"]], "Out-of-Tree Models": [[90, "out-of-tree-models"]], "Overview": [[6, "overview"], [13, "overview"], [15, "overview"], [17, "overview"], [66, null], [69, null]], "Padded and Packed 
Tensors": [[5, "padded-and-packed-tensors"]], "Paged Context Attention": [[75, "paged-context-attention"]], "Paged KV Cache": [[5, "paged-kv-cache"]], "Parallelism Mapping Support": [[68, "parallelism-mapping-support"]], "Parallelism Strategy": [[24, "parallelism-strategy"]], "Pattern and Pattern Manager": [[7, "pattern-and-pattern-manager"]], "Pattern-Matching and Fusion": [[14, "pattern-matching-and-fusion"]], "Performance": [[23, "performance"], [59, null], [75, "performance"]], "Performance Analysis": [[67, null]], "Performance Improvements": [[10, "performance-improvements"]], "Performance Tuning Guide": [[73, null]], "Performance expectations": [[8, "performance-expectations"]], "Performance with GEMM + SwiGLU Fusion": [[72, "performance-with-gemm-swiglu-fusion"]], "Performance with GEMM Plugin": [[75, "performance-with-gemm-plugin"]], "Performance with Low Latency GEMM plugin": [[72, "performance-with-low-latency-gemm-plugin"]], "Performance with Quantized KV Cache": [[72, "performance-with-quantized-kv-cache"]], "Performance with Reduce Norm Fusion": [[75, "performance-with-reduce-norm-fusion"]], "Performance with Reduce Norm Fusion + User Buffers:": [[72, "performance-with-reduce-norm-fusion-user-buffers"]], "Performance with multiple profiles": [[75, "performance-with-multiple-profiles"]], "Persistence mode": [[68, "persistence-mode"]], "Pipeline Parallel Reduce Scatter Optimization": [[75, "pipeline-parallel-reduce-scatter-optimization"]], "Plugin": [[80, null]], "Plugin config arguments": [[25, "tensorrt_llm.commands.build-parse_arguments-plugin-config-arguments"]], "Plugins": [[14, "plugins"]], "Pooling": [[78, "module-tensorrt_llm.layers.pooling"]], "Postprocessing functions": [[15, "postprocessing-functions"]], "Precision Strategy": [[24, "precision-strategy"]], "Prepare": [[28, "prepare"]], "Prepare Dataset": [[70, "prepare-dataset"]], "Prepare the TensorRT-LLM Checkpoint": [[13, "prepare-the-tensorrt-llm-checkpoint"]], "Preparing a Dataset": [[68, 
"preparing-a-dataset"], [69, "preparing-a-dataset"]], "Prerequisite Knowledge": [[73, "prerequisite-knowledge"]], "Prerequisites": [[60, "prerequisites"], [83, "prerequisites"], [90, "prerequisites"]], "Prerequisites: Install TensorRT-LLM and download models": [[18, "prerequisites-install-tensorrt-llm-and-download-models"]], "Profiling specific iterations on a trtllm-bench/trtllm-serve run": [[67, "profiling-specific-iterations-on-a-trtllm-bench-trtllm-serve-run"]], "Prompt-Lookup-Decoding": [[10, "prompt-lookup-decoding"]], "Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs": [[24, null]], "PyExecutor": [[91, "pyexecutor"]], "PyTorch Backend": [[89, null]], "Python Bindings for the Executor API": [[3, "python-bindings-for-the-executor-api"]], "Python runtime (Not recommended to be used)": [[84, "python-runtime-not-recommended-to-be-used"]], "Quantization": [[32, "quantization"], [81, null], [89, "quantization"]], "Quantization APIs": [[17, "quantization-apis"]], "Quantization and Dequantization (Q/DQ)": [[85, "quantization-and-dequantization-q-dq"]], "Quantization in TensorRT-LLM": [[23, "quantization-in-tensorrt-llm"]], "Quantization in the PyTorch Flow": [[68, "quantization-in-the-pytorch-flow"]], "Quantized KV-Cache": [[72, "quantized-kv-cache"]], "Quick Start": [[89, "quick-start"]], "Quick Start Guide": [[83, null]], "Quickstart": [[68, "quickstart"]], "Rank Weights": [[13, "rank-weights"]], "Re-balanced the sparse experts": [[24, "re-balanced-the-sparse-experts"]], "ReDrafter": [[10, "redrafter"]], "Reduce Norm Fusion Plugin for Llama models:": [[75, "reduce-norm-fusion-plugin-for-llama-models"]], "Reduce Norm Fusion with User Buffers for Llama Models": [[72, "reduce-norm-fusion-with-user-buffers-for-llama-models"]], "Reference": [[12, "reference"], [59, null]], "Related Information": [[83, "related-information"]], "Relative Attention Bias (RAB)": [[5, "relative-attention-bias-rab"]], "Relax Acceptance Verification": [[24, 
"relax-acceptance-verification"]], "Release Notes": [[88, null]], "Reproducing Benchmarked Results": [[69, "reproducing-benchmarked-results"]], "Reproducing steps": [[18, "reproducing-steps"]], "Request Additional Output": [[3, "request-additional-output"]], "ResourceManager": [[91, "resourcemanager"]], "Results": [[70, "results"]], "Revisiting Paged Context Attention and Context Chunking": [[74, "revisiting-paged-context-attention-and-context-chunking"]], "Rotary Positional Embedding (RoPE)": [[5, "rotary-positional-embedding-rope"]], "RouterGEMM": [[24, "routergemm"]], "Run gpt-2b + LoRA using Executor / cpp runtime": [[9, null]], "Run the Model": [[83, "run-the-model"]], "Running Throughput and Latency Benchmarks": [[70, "running-throughput-and-latency-benchmarks"]], "Running With Weight Streaming to Reduce GPU Memory Consumption": [[11, null]], "Running multi-modal models in the PyTorch Workflow": [[68, "running-multi-modal-models-in-the-pytorch-workflow"]], "Running the Benchmark": [[69, "running-the-benchmark"]], "Running with the PyTorch Workflow": [[68, "running-with-the-pytorch-workflow"]], "Runtime": [[1, null], [14, "runtime"], [82, null]], "Runtime Customization": [[32, "runtime-customization"]], "Sampling": [[32, "sampling"]], "Sampling Parameters": [[6, "sampling-parameters"]], "Scaling factor(s)": [[5, "scaling-factor-s"]], "Scheduler": [[91, "scheduler"], [94, null]], "Scheduler Introduction": [[94, "scheduler-introduction"]], "Scripts": [[37, null], [58, null]], "Sending Requests with Different Beam Widths": [[3, "sending-requests-with-different-beam-widths"]], "Set power limits": [[68, "set-power-limits"]], "Situations that can prevent kv cache reuse": [[8, "situations-that-can-prevent-kv-cache-reuse"]], "Sliding Window Attention, Cyclic (Rolling Buffer) KV Cache": [[5, "sliding-window-attention-cyclic-rolling-buffer-kv-cache"]], "Smart Router": [[24, "smart-router"]], "Software": [[86, "software"]], "Sparse Experts as GEMMs (only works when 
moe_backend=CUTLASS)": [[24, "sparse-experts-as-gemms-only-works-when-moe-backend-cutlass"]], "Speculative Sampling": [[10, null]], "Speculative decoding arguments": [[25, "tensorrt_llm.commands.build-parse_arguments-speculative-decoding-arguments"]], "Speed up inference with SOTA quantization techniques in TRT-LLM": [[23, null]], "Starting a Server": [[26, "starting-a-server"]], "Step 1. Write Modeling Part": [[12, "step-1-write-modeling-part"]], "Step 2. Implement Weight Conversion": [[12, "step-2-implement-weight-conversion"]], "Step 3. Register New Model": [[12, "step-3-register-new-model"]], "Step 4. Verify New Model": [[12, "step-4-verify-new-model"]], "Step-by-Step Guide": [[90, "step-by-step-guide"]], "StreamingLLM": [[5, "streamingllm"]], "Structured output with guided decoding": [[3, "structured-output-with-guided-decoding"]], "Summary": [[68, "summary"]], "Summary of Configuration Option Recommendations:": [[72, "summary-of-configuration-option-recommendations"], [75, "summary-of-configuration-option-recommendations"]], "Support Matrix": [[86, null]], "Support matrix": [[85, "support-matrix"]], "Supported C++ Header Files": [[60, "supported-c-header-files"]], "Supported Models": [[64, "supported-models"]], "Supported Quantization Modes": [[68, "supported-quantization-modes"]], "Syntax": [[26, "syntax"]], "System Level optimizations": [[24, "system-level-optimizations"]], "TRTLLM Backend": [[24, "trtllm-backend"]], "Table of Contents": [[18, "table-of-contents"], [24, "table-of-contents"], [73, "table-of-contents"], [90, "table-of-contents"]], "Technical Detail: The QuantMode Flags": [[85, "technical-detail-the-quantmode-flags"]], "Tensor Parallel vs Expert Parallel": [[4, "tensor-parallel-vs-expert-parallel"]], "Tensor-Related Methods": [[7, "tensor-related-methods"]], "TensorRT Compiler": [[14, "tensorrt-compiler"]], "TensorRT-LLM Architecture": [[16, null]], "TensorRT-LLM Benchmarking": [[68, null]], "TensorRT-LLM Build Workflow": [[17, null]], 
"TensorRT-LLM Checkpoint": [[13, null]], "TensorRT-LLM Model Weights Loader": [[15, null]], "TensorRT-LLM Release 0.10.0": [[88, "tensorrt-llm-release-0-10-0"]], "TensorRT-LLM Release 0.11.0": [[88, "tensorrt-llm-release-0-11-0"]], "TensorRT-LLM Release 0.12.0": [[88, "tensorrt-llm-release-0-12-0"]], "TensorRT-LLM Release 0.13.0": [[88, "tensorrt-llm-release-0-13-0"]], "TensorRT-LLM Release 0.14.0": [[88, "tensorrt-llm-release-0-14-0"]], "TensorRT-LLM Release 0.15.0": [[88, "tensorrt-llm-release-0-15-0"]], "TensorRT-LLM Release 0.16.0": [[88, "tensorrt-llm-release-0-16-0"]], "TensorRT-LLM Release 0.17.0": [[88, "tensorrt-llm-release-0-17-0"]], "TensorRT-LLM Release 0.18.0": [[88, "tensorrt-llm-release-0-18-0"]], "TensorRT-LLM Release 0.18.1": [[88, "tensorrt-llm-release-0-18-1"]], "TensorRT-LLM Release 0.18.2": [[88, "tensorrt-llm-release-0-18-2"]], "TensorRT-LLM Release 0.19.0": [[88, "tensorrt-llm-release-0-19-0"]], "TensorRT-LLM Release 0.7.1": [[88, "tensorrt-llm-release-0-7-1"]], "TensorRT-LLM Release 0.8.0": [[88, "tensorrt-llm-release-0-8-0"]], "TensorRT-LLM Release 0.9.0": [[88, "tensorrt-llm-release-0-9-0"]], "The Executor Class": [[3, "the-executor-class"]], "The Request Class": [[3, "the-request-class"]], "The Response Class": [[3, "the-response-class"]], "The Result Class": [[3, "the-result-class"]], "Throughput Benchmarking": [[68, "throughput-benchmarking"]], "Throughput Measurements": [[69, "throughput-measurements"]], "Tips": [[87, "tips"]], "Tips and Troubleshooting": [[64, "tips-and-troubleshooting"]], "Tokenizer Customization": [[32, "tokenizer-customization"]], "Top Level API": [[91, "top-level-api"]], "Translator": [[15, "translator"]], "Trouble shooting": [[15, "trouble-shooting"]], "Troubleshooting": [[87, null]], "Troubleshooting Tips and Pitfalls To Avoid": [[70, "troubleshooting-tips-and-pitfalls-to-avoid"]], "Troubleshooting and FAQ": [[2, "troubleshooting-and-faq"]], "Tuning Case Study": [[74, "tuning-case-study"], [74, "id2"]], "Tuning 
Max Batch Size": [[74, "tuning-max-batch-size"]], "Tuning Max Batch Size and Max Num Tokens": [[74, null]], "Tuning Max Num Tokens": [[74, "tuning-max-num-tokens"]], "Understand inference time GPU memory usage": [[84, "understand-inference-time-gpu-memory-usage"]], "Understanding the TensorRT-LLM scheduler": [[74, "understanding-the-tensorrt-llm-scheduler"]], "Upload the Docker Image to DockerHub": [[27, "upload-the-docker-image-to-dockerhub"]], "Usage": [[2, "usage"]], "Useful Build-Time Flags": [[75, null]], "Useful Runtime Options": [[76, null]], "Using Medusa with TensorRT-LLM": [[10, "using-medusa-with-tensorrt-llm"]], "Validated Networks for Benchmarking": [[68, "validated-networks-for-benchmarking"]], "Variables": [[69, "variables"]], "Visualize the PyTorch profiler results": [[67, "visualize-the-pytorch-profiler-results"]], "WIP: Chunked context support on DeepSeek models": [[18, "wip-chunked-context-support-on-deepseek-models"]], "WIP: Enable more features by default": [[18, "wip-enable-more-features-by-default"]], "Weight Bindings": [[14, "weight-bindings"]], "Weight Loading": [[90, "weight-loading"]], "Welcome to TensorRT-LLM\u2019s Documentation!": [[59, null]], "What Can You Do With TensorRT-LLM?": [[66, "what-can-you-do-with-tensorrt-llm"]], "What is H100 FP8?": [[20, "what-is-h100-fp8"]], "What\u2019s coming next": [[23, "whats-coming-next"]], "When to Use Graph Rewriting?": [[7, "when-to-use-graph-rewriting"]], "Workflow": [[15, "workflow"], [68, "workflow"]], "Workload Profile": [[24, "workload-profile"]], "World Configuration": [[6, "world-configuration"]], "XQA Optimization": [[5, "xqa-optimization"]], "bufferManager.h": [[1, "buffermanager-h"]], "cacheCommunicator.h": [[0, "cachecommunicator-h"]], "common.h": [[1, "common-h"]], "cudaEvent.h": [[1, "cudaevent-h"]], "cudaStream.h": [[1, "cudastream-h"]], "dataTransceiverState.h": [[0, "datatransceiverstate-h"]], "decoderState.h": [[1, "decoderstate-h"]], "decodingInput.h": [[1, 
"decodinginput-h"]], "decodingOutput.h": [[1, "decodingoutput-h"]], "disaggServerUtil.h": [[0, "disaggserverutil-h"]], "disaggregated": [[26, "trtllm-serve-disaggregated"]], "disaggregated_mpi_worker": [[26, "trtllm-serve-disaggregated-mpi-worker"]], "eagleBuffers.h": [[1, "eaglebuffers-h"]], "eagleModule.h": [[1, "eaglemodule-h"]], "executor.h": [[0, "executor-h"]], "explicitDraftTokensBuffers.h": [[1, "explicitdrafttokensbuffers-h"]], "gptDecoder.h": [[1, "gptdecoder-h"]], "gptDecoderBatched.h": [[1, "gptdecoderbatched-h"]], "gptJsonConfig.h": [[1, "gptjsonconfig-h"]], "iBuffer.h": [[1, "ibuffer-h"]], "iGptDecoderBatched.h": [[1, "igptdecoderbatched-h"]], "iTensor.h": [[1, "itensor-h"]], "ipcNvlsMemory.h": [[1, "ipcnvlsmemory-h"]], "ipcUtils.h": [[1, "ipcutils-h"]], "lookaheadBuffers.h": [[1, "lookaheadbuffers-h"]], "lookaheadModule.h": [[1, "lookaheadmodule-h"]], "loraCache.h": [[1, "loracache-h"]], "loraCachePageManagerConfig.h": [[1, "loracachepagemanagerconfig-h"]], "loraModule.h": [[1, "loramodule-h"]], "medusaModule.h": [[1, "medusamodule-h"]], "memoryCounters.h": [[1, "memorycounters-h"]], "modelConfig.h": [[1, "modelconfig-h"]], "promptTuningParams.h": [[1, "prompttuningparams-h"]], "rawEngine.h": [[1, "rawengine-h"]], "request.h": [[1, "request-h"]], "runtimeDefaults.h": [[1, "runtimedefaults-h"]], "samplingConfig.h": [[1, "samplingconfig-h"]], "serialization.h": [[0, "serialization-h"]], "serve": [[26, "trtllm-serve-serve"]], "speculativeDecodingMode.h": [[1, "speculativedecodingmode-h"]], "speculativeDecodingModule.h": [[1, "speculativedecodingmodule-h"]], "tensor.h": [[0, "tensor-h"]], "tllmLogger.h": [[1, "tllmlogger-h"]], "trtllm-build": [[25, null]], "trtllm-serve": [[26, null], [26, "trtllm-serve"]], "types.h": [[0, "types-h"]], "worldConfig.h": [[1, "worldconfig-h"]]}, "docnames": ["_cpp_gen/executor", "_cpp_gen/runtime", "advanced/disaggregated-service", "advanced/executor", "advanced/expert-parallelism", "advanced/gpt-attention", 
"advanced/gpt-runtime", "advanced/graph-rewriting", "advanced/kv-cache-reuse", "advanced/lora", "advanced/speculative-decoding", "advanced/weight-streaming", "architecture/add-model", "architecture/checkpoint", "architecture/core-concepts", "architecture/model-weights-loader", "architecture/overview", "architecture/workflow", "blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM", "blogs/Falcon180B-H200", "blogs/H100vsA100", "blogs/H200launch", "blogs/XQA-kernel", "blogs/quantization-in-TRT-LLM", "blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs", "commands/trtllm-build", "commands/trtllm-serve", "dev-on-cloud/build-image-to-dockerhub", "dev-on-cloud/dev-on-runpod", "examples/curl_chat_client", "examples/curl_chat_client_for_multimodal", "examples/curl_completion_client", "examples/customization", "examples/deepseek_r1_reasoning_parser", "examples/genai_perf_client", "examples/genai_perf_client_for_multimodal", "examples/index", "examples/llm_api_examples", "examples/llm_auto_parallel", "examples/llm_eagle_decoding", "examples/llm_guided_decoding", "examples/llm_inference", "examples/llm_inference_async", "examples/llm_inference_async_streaming", "examples/llm_inference_customize", "examples/llm_inference_distributed", "examples/llm_inference_kv_events", "examples/llm_logits_processor", "examples/llm_lookahead_decoding", "examples/llm_medusa_decoding", "examples/llm_mgmn_llm_distributed", "examples/llm_mgmn_trtllm_bench", "examples/llm_mgmn_trtllm_serve", "examples/llm_multilora", "examples/llm_quantization", "examples/openai_chat_client", "examples/openai_chat_client_for_multimodal", "examples/openai_completion_client", "examples/trtllm_serve_examples", "index", "installation/build-from-source-linux", "installation/grace-hopper", "installation/linux", "key-features", "llm-api/index", "llm-api/reference", "overview", "performance/perf-analysis", "performance/perf-benchmarking", "performance/perf-overview", 
"performance/performance-tuning-guide/benchmarking-default-performance", "performance/performance-tuning-guide/deciding-model-sharding-strategy", "performance/performance-tuning-guide/fp8-quantization", "performance/performance-tuning-guide/index", "performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens", "performance/performance-tuning-guide/useful-build-time-flags", "performance/performance-tuning-guide/useful-runtime-flags", "python-api/tensorrt_llm.functional", "python-api/tensorrt_llm.layers", "python-api/tensorrt_llm.models", "python-api/tensorrt_llm.plugin", "python-api/tensorrt_llm.quantization", "python-api/tensorrt_llm.runtime", "quick-start-guide", "reference/memory", "reference/precision", "reference/support-matrix", "reference/troubleshooting", "release-notes", "torch", "torch/adding_new_model", "torch/arch_overview", "torch/attention", "torch/kv_cache_manager", "torch/scheduler"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.todo": 2, "sphinx.ext.viewcode": 1}, "filenames": ["_cpp_gen/executor.rst", "_cpp_gen/runtime.rst", "advanced/disaggregated-service.md", "advanced/executor.md", "advanced/expert-parallelism.md", "advanced/gpt-attention.md", "advanced/gpt-runtime.md", "advanced/graph-rewriting.md", "advanced/kv-cache-reuse.md", "advanced/lora.md", "advanced/speculative-decoding.md", "advanced/weight-streaming.md", "architecture/add-model.md", "architecture/checkpoint.md", "architecture/core-concepts.md", "architecture/model-weights-loader.md", "architecture/overview.md", "architecture/workflow.md", "blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md", "blogs/Falcon180B-H200.md", "blogs/H100vsA100.md", "blogs/H200launch.md", "blogs/XQA-kernel.md", 
"blogs/quantization-in-TRT-LLM.md", "blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md", "commands/trtllm-build.rst", "commands/trtllm-serve.rst", "dev-on-cloud/build-image-to-dockerhub.md", "dev-on-cloud/dev-on-runpod.md", "examples/curl_chat_client.rst", "examples/curl_chat_client_for_multimodal.rst", "examples/curl_completion_client.rst", "examples/customization.md", "examples/deepseek_r1_reasoning_parser.rst", "examples/genai_perf_client.rst", "examples/genai_perf_client_for_multimodal.rst", "examples/index.rst", "examples/llm_api_examples.rst", "examples/llm_auto_parallel.rst", "examples/llm_eagle_decoding.rst", "examples/llm_guided_decoding.rst", "examples/llm_inference.rst", "examples/llm_inference_async.rst", "examples/llm_inference_async_streaming.rst", "examples/llm_inference_customize.rst", "examples/llm_inference_distributed.rst", "examples/llm_inference_kv_events.rst", "examples/llm_logits_processor.rst", "examples/llm_lookahead_decoding.rst", "examples/llm_medusa_decoding.rst", "examples/llm_mgmn_llm_distributed.rst", "examples/llm_mgmn_trtllm_bench.rst", "examples/llm_mgmn_trtllm_serve.rst", "examples/llm_multilora.rst", "examples/llm_quantization.rst", "examples/openai_chat_client.rst", "examples/openai_chat_client_for_multimodal.rst", "examples/openai_completion_client.rst", "examples/trtllm_serve_examples.rst", "index.rst", "installation/build-from-source-linux.md", "installation/grace-hopper.md", "installation/linux.md", "key-features.md", "llm-api/index.md", "llm-api/reference.rst", "overview.md", "performance/perf-analysis.md", "performance/perf-benchmarking.md", "performance/perf-overview.md", "performance/performance-tuning-guide/benchmarking-default-performance.md", "performance/performance-tuning-guide/deciding-model-sharding-strategy.md", "performance/performance-tuning-guide/fp8-quantization.md", "performance/performance-tuning-guide/index.rst", 
"performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.md", "performance/performance-tuning-guide/useful-build-time-flags.md", "performance/performance-tuning-guide/useful-runtime-flags.md", "python-api/tensorrt_llm.functional.rst", "python-api/tensorrt_llm.layers.rst", "python-api/tensorrt_llm.models.rst", "python-api/tensorrt_llm.plugin.rst", "python-api/tensorrt_llm.quantization.rst", "python-api/tensorrt_llm.runtime.rst", "quick-start-guide.md", "reference/memory.md", "reference/precision.md", "reference/support-matrix.md", "reference/troubleshooting.md", "release-notes.md", "torch.md", "torch/adding_new_model.md", "torch/arch_overview.md", "torch/attention.md", "torch/kv_cache_manager.md", "torch/scheduler.md"], "indexentries": {"--backend": [[26, "cmdoption-trtllm-serve-serve-backend", false]], "--cluster_size": [[26, "cmdoption-trtllm-serve-serve-cluster_size", false]], "--config_file": [[26, "cmdoption-trtllm-serve-disaggregated-c", false], [26, "cmdoption-trtllm-serve-disaggregated_mpi_worker-c", false]], "--ep_size": [[26, "cmdoption-trtllm-serve-serve-ep_size", false]], "--extra_llm_api_options": [[26, "cmdoption-trtllm-serve-serve-extra_llm_api_options", false]], "--gpus_per_node": [[26, "cmdoption-trtllm-serve-serve-gpus_per_node", false]], "--host": [[26, "cmdoption-trtllm-serve-serve-host", false]], "--kv_cache_free_gpu_memory_fraction": [[26, "cmdoption-trtllm-serve-serve-kv_cache_free_gpu_memory_fraction", false]], "--log_level": [[26, "cmdoption-trtllm-serve-disaggregated_mpi_worker-log_level", false], [26, "cmdoption-trtllm-serve-serve-log_level", false]], "--max_batch_size": [[26, "cmdoption-trtllm-serve-serve-max_batch_size", false]], "--max_beam_width": [[26, "cmdoption-trtllm-serve-serve-max_beam_width", false]], "--max_num_tokens": [[26, "cmdoption-trtllm-serve-serve-max_num_tokens", false]], "--max_seq_len": [[26, "cmdoption-trtllm-serve-serve-max_seq_len", false]], "--num_postprocess_workers": [[26, 
"cmdoption-trtllm-serve-serve-num_postprocess_workers", false]], "--port": [[26, "cmdoption-trtllm-serve-serve-port", false]], "--pp_size": [[26, "cmdoption-trtllm-serve-serve-pp_size", false]], "--reasoning_parser": [[26, "cmdoption-trtllm-serve-serve-reasoning_parser", false]], "--request_timeout": [[26, "cmdoption-trtllm-serve-disaggregated-r", false]], "--server_start_timeout": [[26, "cmdoption-trtllm-serve-disaggregated-t", false]], "--tokenizer": [[26, "cmdoption-trtllm-serve-serve-tokenizer", false]], "--tp_size": [[26, "cmdoption-trtllm-serve-serve-tp_size", false]], "--trust_remote_code": [[26, "cmdoption-trtllm-serve-serve-trust_remote_code", false]], "-c": [[26, "cmdoption-trtllm-serve-disaggregated-c", false], [26, "cmdoption-trtllm-serve-disaggregated_mpi_worker-c", false]], "-r": [[26, "cmdoption-trtllm-serve-disaggregated-r", false]], "-t": [[26, "cmdoption-trtllm-serve-disaggregated-t", false]], "__init__() (tensorrt_llm.llmapi.buildcacheconfig method)": [[65, "tensorrt_llm.llmapi.BuildCacheConfig.__init__", false]], "__init__() (tensorrt_llm.llmapi.buildconfig method)": [[65, "tensorrt_llm.llmapi.BuildConfig.__init__", false]], "__init__() (tensorrt_llm.llmapi.completionoutput method)": [[65, "tensorrt_llm.llmapi.CompletionOutput.__init__", false]], "__init__() (tensorrt_llm.llmapi.disaggregatedparams method)": [[65, "tensorrt_llm.llmapi.DisaggregatedParams.__init__", false]], "__init__() (tensorrt_llm.llmapi.guideddecodingparams method)": [[65, "tensorrt_llm.llmapi.GuidedDecodingParams.__init__", false]], "__init__() (tensorrt_llm.llmapi.kvcacheretentionconfig method)": [[65, "tensorrt_llm.llmapi.KvCacheRetentionConfig.__init__", false]], "__init__() (tensorrt_llm.llmapi.kvcacheretentionconfig.tokenrangeretentionconfig method)": [[65, "tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.__init__", false]], "__init__() (tensorrt_llm.llmapi.llm method)": [[65, "tensorrt_llm.llmapi.LLM.__init__", false]], "__init__() 
(tensorrt_llm.llmapi.lookaheaddecodingconfig method)": [[65, "tensorrt_llm.llmapi.LookaheadDecodingConfig.__init__", false]], "__init__() (tensorrt_llm.llmapi.mpicommsession method)": [[65, "tensorrt_llm.llmapi.MpiCommSession.__init__", false]], "__init__() (tensorrt_llm.llmapi.quantconfig method)": [[65, "tensorrt_llm.llmapi.QuantConfig.__init__", false]], "__init__() (tensorrt_llm.llmapi.requestoutput method)": [[65, "tensorrt_llm.llmapi.RequestOutput.__init__", false]], "__init__() (tensorrt_llm.llmapi.samplingparams method)": [[65, "tensorrt_llm.llmapi.SamplingParams.__init__", false]], "abort() (tensorrt_llm.llmapi.mpicommsession method)": [[65, "tensorrt_llm.llmapi.MpiCommSession.abort", false]], "abs() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.abs", false]], "abs() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.abs", false]], "activation() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.activation", false]], "adalayernorm (class in tensorrt_llm.layers.normalization)": [[78, "tensorrt_llm.layers.normalization.AdaLayerNorm", false]], "adalayernormcontinuous (class in tensorrt_llm.layers.normalization)": [[78, "tensorrt_llm.layers.normalization.AdaLayerNormContinuous", false]], "adalayernormzero (class in tensorrt_llm.layers.normalization)": [[78, "tensorrt_llm.layers.normalization.AdaLayerNormZero", false]], "adalayernormzerosingle (class in tensorrt_llm.layers.normalization)": [[78, "tensorrt_llm.layers.normalization.AdaLayerNormZeroSingle", false]], "add() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.add", false]], "add_input() (tensorrt_llm.functional.conditional method)": [[77, "tensorrt_llm.functional.Conditional.add_input", false]], "add_output() (tensorrt_llm.functional.conditional method)": [[77, "tensorrt_llm.functional.Conditional.add_output", false]], "add_sequence() (tensorrt_llm.runtime.kvcachemanager method)": [[82, 
"tensorrt_llm.runtime.KVCacheManager.add_sequence", false]], "add_special_tokens (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.add_special_tokens", false]], "additional_model_outputs (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.additional_model_outputs", false]], "alibi (tensorrt_llm.functional.positionembeddingtype attribute)": [[77, "tensorrt_llm.functional.PositionEmbeddingType.alibi", false]], "alibi_with_scale (tensorrt_llm.functional.positionembeddingtype attribute)": [[77, "tensorrt_llm.functional.PositionEmbeddingType.alibi_with_scale", false]], "allgather() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.allgather", false]], "allreduce() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.allreduce", false]], "allreducefusionop (class in tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.AllReduceFusionOp", false]], "allreduceparams (class in tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.AllReduceParams", false]], "allreducestrategy (class in tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.AllReduceStrategy", false]], "apply_batched_logits_processor (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.apply_batched_logits_processor", false]], "apply_llama3_scaling() (tensorrt_llm.functional.ropeembeddingutils static method)": [[77, "tensorrt_llm.functional.RopeEmbeddingUtils.apply_llama3_scaling", false]], "apply_rotary_pos_emb() (tensorrt_llm.functional.ropeembeddingutils static method)": [[77, "tensorrt_llm.functional.RopeEmbeddingUtils.apply_rotary_pos_emb", false]], "apply_rotary_pos_emb_chatglm() (tensorrt_llm.functional.ropeembeddingutils static method)": [[77, "tensorrt_llm.functional.RopeEmbeddingUtils.apply_rotary_pos_emb_chatglm", false]], "apply_rotary_pos_emb_cogvlm() (tensorrt_llm.functional.ropeembeddingutils static method)": [[77, 
"tensorrt_llm.functional.RopeEmbeddingUtils.apply_rotary_pos_emb_cogvlm", false]], "arange() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.arange", false]], "argmax() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.argmax", false]], "assert_valid_quant_algo() (tensorrt_llm.models.gemmaforcausallm class method)": [[79, "tensorrt_llm.models.GemmaForCausalLM.assert_valid_quant_algo", false]], "assertion() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.assertion", false]], "attention (class in tensorrt_llm.layers.attention)": [[78, "tensorrt_llm.layers.attention.Attention", false]], "attentionmaskparams (class in tensorrt_llm.layers.attention)": [[78, "tensorrt_llm.layers.attention.AttentionMaskParams", false]], "attentionmasktype (class in tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.AttentionMaskType", false]], "attentionparams (class in tensorrt_llm.layers.attention)": [[78, "tensorrt_llm.layers.attention.AttentionParams", false]], "attn_processors (tensorrt_llm.models.sd3transformer2dmodel property)": [[79, "tensorrt_llm.models.SD3Transformer2DModel.attn_processors", false]], "audio_engine_dir (tensorrt_llm.runtime.multimodalmodelrunner property)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.audio_engine_dir", false]], "auto (tensorrt_llm.functional.allreducestrategy attribute)": [[77, "tensorrt_llm.functional.AllReduceStrategy.AUTO", false]], "auto_parallel_config (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.auto_parallel_config", false]], "avg_pool2d() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.avg_pool2d", false]], "avgpool2d (class in tensorrt_llm.layers.pooling)": [[78, "tensorrt_llm.layers.pooling.AvgPool2d", false]], "axes (tensorrt_llm.functional.sliceinputtype attribute)": [[77, "tensorrt_llm.functional.SliceInputType.axes", false]], "bad (tensorrt_llm.llmapi.samplingparams attribute)": [[65, 
"tensorrt_llm.llmapi.SamplingParams.bad", false]], "bad_token_ids (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.bad_token_ids", false]], "bad_words_list (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.bad_words_list", false]], "baichuanforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.BaichuanForCausalLM", false]], "batch_size (tensorrt_llm.runtime.generationsession attribute)": [[82, "tensorrt_llm.runtime.GenerationSession.batch_size", false]], "batchingtype (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.BatchingType", false]], "beam_search_diversity_rate (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.beam_search_diversity_rate", false]], "beam_search_diversity_rate (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.beam_search_diversity_rate", false]], "beam_width_array (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.beam_width_array", false]], "bert_attention() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.bert_attention", false]], "bertattention (class in tensorrt_llm.layers.attention)": [[78, "tensorrt_llm.layers.attention.BertAttention", false]], "bertforquestionanswering (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.BertForQuestionAnswering", false]], "bertforsequenceclassification (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.BertForSequenceClassification", false]], "bertmodel (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.BertModel", false]], "best_of (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.best_of", false]], "bidirectional (tensorrt_llm.functional.attentionmasktype attribute)": [[77, "tensorrt_llm.functional.AttentionMaskType.bidirectional", false]], "bidirectionalglm 
(tensorrt_llm.functional.attentionmasktype attribute)": [[77, "tensorrt_llm.functional.AttentionMaskType.bidirectionalglm", false]], "blocksparse (tensorrt_llm.functional.attentionmasktype attribute)": [[77, "tensorrt_llm.functional.AttentionMaskType.blocksparse", false]], "blocksparseattnparams (class in tensorrt_llm.layers.attention)": [[78, "tensorrt_llm.layers.attention.BlockSparseAttnParams", false]], "bloomforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.BloomForCausalLM", false]], "bloommodel (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.BloomModel", false]], "broadcast_helper() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.broadcast_helper", false]], "buffer_allocated (tensorrt_llm.runtime.generationsession attribute)": [[82, "tensorrt_llm.runtime.GenerationSession.buffer_allocated", false]], "buildcacheconfig (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.BuildCacheConfig", false]], "buildconfig (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.BuildConfig", false]], "cache_root (tensorrt_llm.llmapi.buildcacheconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildCacheConfig.cache_root", false]], "cache_root (tensorrt_llm.llmapi.buildcacheconfig property)": [[65, "id7", false]], "cachetransceiverconfig (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.CacheTransceiverConfig", false]], "calculate_speculative_resource() (tensorrt_llm.llmapi.lookaheaddecodingconfig method)": [[65, "tensorrt_llm.llmapi.LookaheadDecodingConfig.calculate_speculative_resource", false]], "calib_batch_size (tensorrt_llm.llmapi.calibconfig attribute)": [[65, "tensorrt_llm.llmapi.CalibConfig.calib_batch_size", false]], "calib_batches (tensorrt_llm.llmapi.calibconfig attribute)": [[65, "tensorrt_llm.llmapi.CalibConfig.calib_batches", false]], "calib_dataset (tensorrt_llm.llmapi.calibconfig attribute)": [[65, "tensorrt_llm.llmapi.CalibConfig.calib_dataset", false]], "calib_max_seq_length 
(tensorrt_llm.llmapi.calibconfig attribute)": [[65, "tensorrt_llm.llmapi.CalibConfig.calib_max_seq_length", false]], "calibconfig (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.CalibConfig", false]], "capacity_scheduler_policy (tensorrt_llm.llmapi.schedulerconfig attribute)": [[65, "tensorrt_llm.llmapi.SchedulerConfig.capacity_scheduler_policy", false]], "capacityschedulerpolicy (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.CapacitySchedulerPolicy", false]], "cast (class in tensorrt_llm.layers.cast)": [[78, "tensorrt_llm.layers.cast.Cast", false]], "cast() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.cast", false]], "cast() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.cast", false]], "categorical_sample() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.categorical_sample", false]], "causal (tensorrt_llm.functional.attentionmasktype attribute)": [[77, "tensorrt_llm.functional.AttentionMaskType.causal", false]], "chatglm (tensorrt_llm.functional.positionembeddingtype attribute)": [[77, "tensorrt_llm.functional.PositionEmbeddingType.chatglm", false]], "chatglmconfig (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.ChatGLMConfig", false]], "chatglmforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.ChatGLMForCausalLM", false]], "chatglmgenerationsession (class in tensorrt_llm.runtime)": [[82, "tensorrt_llm.runtime.ChatGLMGenerationSession", false]], "chatglmmodel (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.ChatGLMModel", false]], "check_config() (tensorrt_llm.models.decodermodel method)": [[79, "tensorrt_llm.models.DecoderModel.check_config", false]], "check_config() (tensorrt_llm.models.dit method)": [[79, "tensorrt_llm.models.DiT.check_config", false]], "check_config() (tensorrt_llm.models.encodermodel method)": [[79, "tensorrt_llm.models.EncoderModel.check_config", false]], "check_config() 
(tensorrt_llm.models.falconforcausallm method)": [[79, "tensorrt_llm.models.FalconForCausalLM.check_config", false]], "check_config() (tensorrt_llm.models.mptforcausallm method)": [[79, "tensorrt_llm.models.MPTForCausalLM.check_config", false]], "check_config() (tensorrt_llm.models.optforcausallm method)": [[79, "tensorrt_llm.models.OPTForCausalLM.check_config", false]], "check_config() (tensorrt_llm.models.phiforcausallm method)": [[79, "tensorrt_llm.models.PhiForCausalLM.check_config", false]], "check_config() (tensorrt_llm.models.pretrainedmodel method)": [[79, "tensorrt_llm.models.PretrainedModel.check_config", false]], "choices() (tensorrt_llm.functional.positionembeddingtype static method)": [[77, "tensorrt_llm.functional.PositionEmbeddingType.choices", false]], "chunk() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.chunk", false]], "clamp_val (tensorrt_llm.llmapi.quantconfig attribute)": [[65, "tensorrt_llm.llmapi.QuantConfig.clamp_val", false]], "clip() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.clip", false]], "clipvisiontransformer (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.CLIPVisionTransformer", false]], "cogvlmattention (class in tensorrt_llm.layers.attention)": [[78, "tensorrt_llm.layers.attention.CogVLMAttention", false]], "cogvlmconfig (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.CogVLMConfig", false]], "cogvlmforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.CogVLMForCausalLM", false]], "cohereforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.CohereForCausalLM", false]], "collect_and_bias() (tensorrt_llm.layers.linear.linear method)": [[78, "tensorrt_llm.layers.linear.Linear.collect_and_bias", false]], "collect_and_bias() (tensorrt_llm.layers.linear.linearbase method)": [[78, "tensorrt_llm.layers.linear.LinearBase.collect_and_bias", false]], "collect_and_bias() (tensorrt_llm.layers.linear.rowlinear method)": [[78, 
"tensorrt_llm.layers.linear.RowLinear.collect_and_bias", false]], "columnlinear (in module tensorrt_llm.layers.linear)": [[78, "tensorrt_llm.layers.linear.ColumnLinear", false]], "combinedtimesteplabelembeddings (class in tensorrt_llm.layers.embedding)": [[78, "tensorrt_llm.layers.embedding.CombinedTimestepLabelEmbeddings", false]], "combinedtimesteptextprojembeddings (class in tensorrt_llm.layers.embedding)": [[78, "tensorrt_llm.layers.embedding.CombinedTimestepTextProjEmbeddings", false]], "completionoutput (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.CompletionOutput", false]], "compute_relative_bias() (in module tensorrt_llm.layers.attention)": [[78, "tensorrt_llm.layers.attention.compute_relative_bias", false]], "concat() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.concat", false]], "conditional (class in tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.Conditional", false]], "config_class (tensorrt_llm.models.baichuanforcausallm attribute)": [[79, "tensorrt_llm.models.BaichuanForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.chatglmforcausallm attribute)": [[79, "tensorrt_llm.models.ChatGLMForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.cogvlmforcausallm attribute)": [[79, "tensorrt_llm.models.CogVLMForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.cohereforcausallm attribute)": [[79, "tensorrt_llm.models.CohereForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.dbrxforcausallm attribute)": [[79, "tensorrt_llm.models.DbrxForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.deepseekforcausallm attribute)": [[79, "tensorrt_llm.models.DeepseekForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.deepseekv2forcausallm attribute)": [[79, "tensorrt_llm.models.DeepseekV2ForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.eagleforcausallm attribute)": [[79, 
"tensorrt_llm.models.EagleForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.falconforcausallm attribute)": [[79, "tensorrt_llm.models.FalconForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.gemmaforcausallm attribute)": [[79, "tensorrt_llm.models.GemmaForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.gptforcausallm attribute)": [[79, "tensorrt_llm.models.GPTForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.gptjforcausallm attribute)": [[79, "tensorrt_llm.models.GPTJForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.llamaforcausallm attribute)": [[79, "tensorrt_llm.models.LLaMAForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.mambaforcausallm attribute)": [[79, "tensorrt_llm.models.MambaForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.medusaforcausallm attribute)": [[79, "tensorrt_llm.models.MedusaForCausalLm.config_class", false]], "config_class (tensorrt_llm.models.mllamaforcausallm attribute)": [[79, "tensorrt_llm.models.MLLaMAForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.phi3forcausallm attribute)": [[79, "tensorrt_llm.models.Phi3ForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.phiforcausallm attribute)": [[79, "tensorrt_llm.models.PhiForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.sd3transformer2dmodel attribute)": [[79, "tensorrt_llm.models.SD3Transformer2DModel.config_class", false]], "constant() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.constant", false]], "constant_to_tensor_() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.constant_to_tensor_", false]], "constants_to_tensors_() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.constants_to_tensors_", false]], "context (tensorrt_llm.runtime.session property)": [[82, "tensorrt_llm.runtime.Session.context", false]], 
"context_chunking_policy (tensorrt_llm.llmapi.schedulerconfig attribute)": [[65, "tensorrt_llm.llmapi.SchedulerConfig.context_chunking_policy", false]], "context_logits (tensorrt_llm.llmapi.requestoutput attribute)": [[65, "tensorrt_llm.llmapi.RequestOutput.context_logits", false]], "context_mem_size (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.context_mem_size", false]], "context_mem_size (tensorrt_llm.runtime.session property)": [[82, "tensorrt_llm.runtime.Session.context_mem_size", false]], "contextchunkingpolicy (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.ContextChunkingPolicy", false]], "conv1d (class in tensorrt_llm.layers.conv)": [[78, "tensorrt_llm.layers.conv.Conv1d", false]], "conv1d() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.conv1d", false]], "conv2d (class in tensorrt_llm.layers.conv)": [[78, "tensorrt_llm.layers.conv.Conv2d", false]], "conv2d() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.conv2d", false]], "conv3d (class in tensorrt_llm.layers.conv)": [[78, "tensorrt_llm.layers.conv.Conv3d", false]], "conv3d() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.conv3d", false]], "conv_kernel (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.conv_kernel", false]], "conv_kernel (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.conv_kernel", false]], "conv_transpose2d() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.conv_transpose2d", false]], "convtranspose2d (class in tensorrt_llm.layers.conv)": [[78, "tensorrt_llm.layers.conv.ConvTranspose2d", false]], "copy_on_partial_reuse (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[65, "tensorrt_llm.llmapi.KvCacheConfig.copy_on_partial_reuse", false]], "cos() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.cos", false]], "cp_split_plugin() (in 
module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.cp_split_plugin", false]], "cpp_e2e (tensorrt_llm.runtime.multimodalmodelrunner property)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.cpp_e2e", false]], "cpp_llm_only (tensorrt_llm.runtime.multimodalmodelrunner property)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.cpp_llm_only", false]], "create_allreduce_plugin() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.create_allreduce_plugin", false]], "create_attention_const_params() (tensorrt_llm.layers.attention.attention static method)": [[78, "tensorrt_llm.layers.attention.Attention.create_attention_const_params", false]], "create_fake_weight() (tensorrt_llm.functional.ropeembeddingutils static method)": [[77, "tensorrt_llm.functional.RopeEmbeddingUtils.create_fake_weight", false]], "create_runtime_defaults() (tensorrt_llm.models.pretrainedconfig static method)": [[79, "tensorrt_llm.models.PretrainedConfig.create_runtime_defaults", false]], "create_sinusoidal_positions() (tensorrt_llm.functional.ropeembeddingutils static method)": [[77, "tensorrt_llm.functional.RopeEmbeddingUtils.create_sinusoidal_positions", false]], "create_sinusoidal_positions_for_attention_plugin() (tensorrt_llm.functional.ropeembeddingutils static method)": [[77, "tensorrt_llm.functional.RopeEmbeddingUtils.create_sinusoidal_positions_for_attention_plugin", false]], "create_sinusoidal_positions_for_cogvlm_attention_plugin() (tensorrt_llm.functional.ropeembeddingutils static method)": [[77, "tensorrt_llm.functional.RopeEmbeddingUtils.create_sinusoidal_positions_for_cogvlm_attention_plugin", false]], "create_sinusoidal_positions_long_rope() (tensorrt_llm.functional.ropeembeddingutils method)": [[77, "tensorrt_llm.functional.RopeEmbeddingUtils.create_sinusoidal_positions_long_rope", false]], "create_sinusoidal_positions_yarn() (tensorrt_llm.functional.ropeembeddingutils static method)": [[77, 
"tensorrt_llm.functional.RopeEmbeddingUtils.create_sinusoidal_positions_yarn", false]], "cropped_pos_embed() (tensorrt_llm.layers.embedding.sd3patchembed method)": [[78, "tensorrt_llm.layers.embedding.SD3PatchEmbed.cropped_pos_embed", false]], "cross_attention (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.cross_attention", false]], "cross_attention (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.cross_attention", false]], "cross_kv_cache_fraction (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[65, "tensorrt_llm.llmapi.KvCacheConfig.cross_kv_cache_fraction", false]], "ctx_request_id (tensorrt_llm.llmapi.disaggregatedparams attribute)": [[65, "tensorrt_llm.llmapi.DisaggregatedParams.ctx_request_id", false]], "cuda_graph_cache_size (tensorrt_llm.llmapi.extendedruntimeperfknobconfig attribute)": [[65, "tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.cuda_graph_cache_size", false]], "cuda_graph_mode (tensorrt_llm.llmapi.extendedruntimeperfknobconfig attribute)": [[65, "tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.cuda_graph_mode", false]], "cuda_graph_mode (tensorrt_llm.runtime.generationsession attribute)": [[82, "tensorrt_llm.runtime.GenerationSession.cuda_graph_mode", false]], "cuda_stream_guard() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.cuda_stream_guard", false]], "cuda_stream_sync() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.cuda_stream_sync", false]], "cumsum() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.cumsum", false]], "cumulative_logprob (tensorrt_llm.llmapi.completionoutput attribute)": [[65, "tensorrt_llm.llmapi.CompletionOutput.cumulative_logprob", false]], "custom_mask (tensorrt_llm.functional.attentionmasktype attribute)": [[77, "tensorrt_llm.functional.AttentionMaskType.custom_mask", false]], "data (tensorrt_llm.functional.sliceinputtype attribute)": 
[[77, "tensorrt_llm.functional.SliceInputType.data", false]], "dbrxconfig (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.DbrxConfig", false]], "dbrxforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.DbrxForCausalLM", false]], "debug_mode (tensorrt_llm.runtime.generationsession attribute)": [[82, "tensorrt_llm.runtime.GenerationSession.debug_mode", false]], "debug_tensors_to_save (tensorrt_llm.runtime.generationsession attribute)": [[82, "tensorrt_llm.runtime.GenerationSession.debug_tensors_to_save", false]], "decode() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.decode", false]], "decode_batch() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.decode_batch", false]], "decode_duration_ms (tensorrt_llm.llmapi.kvcacheretentionconfig property)": [[65, "tensorrt_llm.llmapi.KvCacheRetentionConfig.decode_duration_ms", false]], "decode_regular() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.decode_regular", false]], "decode_retention_priority (tensorrt_llm.llmapi.kvcacheretentionconfig property)": [[65, "tensorrt_llm.llmapi.KvCacheRetentionConfig.decode_retention_priority", false]], "decode_stream() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.decode_stream", false]], "decode_words_list() (in module tensorrt_llm.runtime)": [[82, "tensorrt_llm.runtime.decode_words_list", false]], "decodermodel (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.DecoderModel", false]], "decoding_type (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.EagleDecodingConfig.decoding_type", false]], "decoding_type (tensorrt_llm.llmapi.lookaheaddecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.LookaheadDecodingConfig.decoding_type", false]], "decoding_type (tensorrt_llm.llmapi.medusadecodingconfig attribute)": [[65, 
"tensorrt_llm.llmapi.MedusaDecodingConfig.decoding_type", false]], "decoding_type (tensorrt_llm.llmapi.mtpdecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.MTPDecodingConfig.decoding_type", false]], "deepseekforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.DeepseekForCausalLM", false]], "deepseekv2attention (class in tensorrt_llm.layers.attention)": [[78, "tensorrt_llm.layers.attention.DeepseekV2Attention", false]], "deepseekv2forcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.DeepseekV2ForCausalLM", false]], "default_plugin_config() (tensorrt_llm.models.cogvlmforcausallm method)": [[79, "tensorrt_llm.models.CogVLMForCausalLM.default_plugin_config", false]], "default_plugin_config() (tensorrt_llm.models.llamaforcausallm method)": [[79, "tensorrt_llm.models.LLaMAForCausalLM.default_plugin_config", false]], "deferred (tensorrt_llm.functional.positionembeddingtype attribute)": [[77, "tensorrt_llm.functional.PositionEmbeddingType.deferred", false]], "detokenize (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.detokenize", false]], "device (tensorrt_llm.llmapi.calibconfig attribute)": [[65, "tensorrt_llm.llmapi.CalibConfig.device", false]], "device (tensorrt_llm.runtime.generationsession attribute)": [[82, "tensorrt_llm.runtime.GenerationSession.device", false]], "diffusersattention (class in tensorrt_llm.layers.attention)": [[78, "tensorrt_llm.layers.attention.DiffusersAttention", false]], "dimrange (class in tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.DimRange", false]], "disable (tensorrt_llm.functional.sidestreamidtype attribute)": [[77, "tensorrt_llm.functional.SideStreamIDType.disable", false]], "disable_forward_chunking() (tensorrt_llm.models.sd3transformer2dmodel method)": [[79, "tensorrt_llm.models.SD3Transformer2DModel.disable_forward_chunking", false]], "disaggregated_params (tensorrt_llm.llmapi.completionoutput attribute)": [[65, 
"tensorrt_llm.llmapi.CompletionOutput.disaggregated_params", false]], "disaggregatedparams (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.DisaggregatedParams", false]], "dit (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.DiT", false]], "div() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.div", false]], "dora_plugin() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.dora_plugin", false]], "draft_tokens (tensorrt_llm.llmapi.disaggregatedparams attribute)": [[65, "tensorrt_llm.llmapi.DisaggregatedParams.draft_tokens", false]], "draft_tokens_external (tensorrt_llm.models.speculativedecodingmode attribute)": [[79, "tensorrt_llm.models.SpeculativeDecodingMode.DRAFT_TOKENS_EXTERNAL", false]], "dry_run (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.dry_run", false]], "dtype (tensorrt_llm.functional.tensor property)": [[77, "tensorrt_llm.functional.Tensor.dtype", false]], "dtype (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.dtype", false]], "dtype (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.dtype", false]], "dtype (tensorrt_llm.runtime.modelrunner property)": [[82, "tensorrt_llm.runtime.ModelRunner.dtype", false]], "dtype (tensorrt_llm.runtime.modelrunnercpp property)": [[82, "tensorrt_llm.runtime.ModelRunnerCpp.dtype", false]], "dtype (tensorrt_llm.runtime.tensorinfo attribute)": [[82, "tensorrt_llm.runtime.TensorInfo.dtype", false]], "dump_debug_buffers() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.dump_debug_buffers", false]], "duration_ms (tensorrt_llm.llmapi.kvcacheretentionconfig.tokenrangeretentionconfig property)": [[65, "tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.duration_ms", false]], "dynamic (tensorrt_llm.functional.rotaryscalingtype attribute)": [[77, 
"tensorrt_llm.functional.RotaryScalingType.dynamic", false]], "dynamic_batch_config (tensorrt_llm.llmapi.schedulerconfig attribute)": [[65, "tensorrt_llm.llmapi.SchedulerConfig.dynamic_batch_config", false]], "dynamic_batch_moving_average_window (tensorrt_llm.llmapi.dynamicbatchconfig attribute)": [[65, "tensorrt_llm.llmapi.DynamicBatchConfig.dynamic_batch_moving_average_window", false]], "dynamic_tree_max_topk (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.EagleDecodingConfig.dynamic_tree_max_topK", false]], "dynamicbatchconfig (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.DynamicBatchConfig", false]], "eagle (tensorrt_llm.models.speculativedecodingmode attribute)": [[79, "tensorrt_llm.models.SpeculativeDecodingMode.EAGLE", false]], "eagle_choices (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.EagleDecodingConfig.eagle_choices", false]], "eagledecodingconfig (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.EagleDecodingConfig", false]], "eagleforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.EagleForCausalLM", false]], "early_stop_criteria() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.early_stop_criteria", false]], "early_stopping (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.early_stopping", false]], "early_stopping (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.early_stopping", false]], "einsum() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.einsum", false]], "elementwise_binary() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.elementwise_binary", false]], "embedding (class in tensorrt_llm.layers.embedding)": [[78, "tensorrt_llm.layers.embedding.Embedding", false]], "embedding() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.embedding", 
false]], "embedding_bias (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.embedding_bias", false]], "enable_batch_size_tuning (tensorrt_llm.llmapi.dynamicbatchconfig attribute)": [[65, "tensorrt_llm.llmapi.DynamicBatchConfig.enable_batch_size_tuning", false]], "enable_block_reuse (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[65, "tensorrt_llm.llmapi.KvCacheConfig.enable_block_reuse", false]], "enable_context_fmha_fp32_acc (tensorrt_llm.llmapi.extendedruntimeperfknobconfig attribute)": [[65, "tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.enable_context_fmha_fp32_acc", false]], "enable_debug_output (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.enable_debug_output", false]], "enable_forward_chunking() (tensorrt_llm.models.sd3transformer2dmodel method)": [[79, "tensorrt_llm.models.SD3Transformer2DModel.enable_forward_chunking", false]], "enable_max_num_tokens_tuning (tensorrt_llm.llmapi.dynamicbatchconfig attribute)": [[65, "tensorrt_llm.llmapi.DynamicBatchConfig.enable_max_num_tokens_tuning", false]], "enable_partial_reuse (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[65, "tensorrt_llm.llmapi.KvCacheConfig.enable_partial_reuse", false]], "encdecmodelrunner (class in tensorrt_llm.runtime)": [[82, "tensorrt_llm.runtime.EncDecModelRunner", false]], "encoder_run() (tensorrt_llm.runtime.encdecmodelrunner method)": [[82, "tensorrt_llm.runtime.EncDecModelRunner.encoder_run", false]], "encodermodel (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.EncoderModel", false]], "end_id (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.end_id", false]], "end_id (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.end_id", false]], "engine (tensorrt_llm.runtime.session property)": [[82, "tensorrt_llm.runtime.Session.engine", false]], "engine_inspector (tensorrt_llm.runtime.generationsession property)": 
[[82, "tensorrt_llm.runtime.GenerationSession.engine_inspector", false]], "eq() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.eq", false]], "equal_progress (tensorrt_llm.llmapi.contextchunkingpolicy attribute)": [[65, "tensorrt_llm.llmapi.ContextChunkingPolicy.EQUAL_PROGRESS", false]], "event_buffer_max_size (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[65, "tensorrt_llm.llmapi.KvCacheConfig.event_buffer_max_size", false]], "exclude_input_from_output (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.exclude_input_from_output", false]], "exclude_modules (tensorrt_llm.llmapi.quantconfig attribute)": [[65, "tensorrt_llm.llmapi.QuantConfig.exclude_modules", false]], "exp() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.exp", false]], "expand() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.expand", false]], "expand_dims() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.expand_dims", false]], "expand_dims_like() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.expand_dims_like", false]], "expand_mask() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.expand_mask", false]], "explicit_draft_tokens (tensorrt_llm.models.speculativedecodingmode attribute)": [[79, "tensorrt_llm.models.SpeculativeDecodingMode.EXPLICIT_DRAFT_TOKENS", false]], "extendedruntimeperfknobconfig (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig", false]], "falconconfig (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.FalconConfig", false]], "falconforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.FalconForCausalLM", false]], "falconmodel (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.FalconModel", false]], "fc_gate() (tensorrt_llm.layers.mlp.fusedgatedmlp method)": [[78, "tensorrt_llm.layers.mlp.FusedGatedMLP.fc_gate", false]], "fc_gate_dora() 
(in module tensorrt_llm.layers.mlp)": [[78, "tensorrt_llm.layers.mlp.fc_gate_dora", false]], "fc_gate_lora() (in module tensorrt_llm.layers.mlp)": [[78, "tensorrt_llm.layers.mlp.fc_gate_lora", false]], "fc_gate_plugin() (tensorrt_llm.layers.mlp.fusedgatedmlp method)": [[78, "tensorrt_llm.layers.mlp.FusedGatedMLP.fc_gate_plugin", false]], "fill_attention_const_params_for_long_rope() (tensorrt_llm.layers.attention.attentionparams method)": [[78, "tensorrt_llm.layers.attention.AttentionParams.fill_attention_const_params_for_long_rope", false]], "fill_attention_const_params_for_rope() (tensorrt_llm.layers.attention.attentionparams method)": [[78, "tensorrt_llm.layers.attention.AttentionParams.fill_attention_const_params_for_rope", false]], "fill_attention_params() (tensorrt_llm.layers.attention.attention static method)": [[78, "tensorrt_llm.layers.attention.Attention.fill_attention_params", false]], "fill_none_tensor_list() (tensorrt_llm.layers.attention.keyvaluecacheparams method)": [[78, "tensorrt_llm.layers.attention.KeyValueCacheParams.fill_none_tensor_list", false]], "fill_value (tensorrt_llm.functional.sliceinputtype attribute)": [[77, "tensorrt_llm.functional.SliceInputType.fill_value", false]], "filter_medusa_logits() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.filter_medusa_logits", false]], "finalize_decoder() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.finalize_decoder", false]], "find_best_medusa_path() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.find_best_medusa_path", false]], "finish_reason (tensorrt_llm.llmapi.completionoutput attribute)": [[65, "tensorrt_llm.llmapi.CompletionOutput.finish_reason", false]], "finished (tensorrt_llm.llmapi.requestoutput attribute)": [[65, "tensorrt_llm.llmapi.RequestOutput.finished", false]], "first_come_first_served (tensorrt_llm.llmapi.contextchunkingpolicy 
attribute)": [[65, "tensorrt_llm.llmapi.ContextChunkingPolicy.FIRST_COME_FIRST_SERVED", false]], "first_gen_tokens (tensorrt_llm.llmapi.disaggregatedparams attribute)": [[65, "tensorrt_llm.llmapi.DisaggregatedParams.first_gen_tokens", false]], "first_layer (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.first_layer", false]], "flatten() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.flatten", false]], "flatten() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.flatten", false]], "flip() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.flip", false]], "floordiv() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.floordiv", false]], "fmt_dim (c macro)": [[1, "c.FMT_DIM", false]], "for_each_rank() (tensorrt_llm.models.pretrainedconfig method)": [[79, "tensorrt_llm.models.PretrainedConfig.for_each_rank", false]], "force_num_profiles (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.force_num_profiles", false]], "forward() (tensorrt_llm.layers.activation.mish method)": [[78, "tensorrt_llm.layers.activation.Mish.forward", false]], "forward() (tensorrt_llm.layers.attention.attention method)": [[78, "tensorrt_llm.layers.attention.Attention.forward", false]], "forward() (tensorrt_llm.layers.attention.bertattention method)": [[78, "tensorrt_llm.layers.attention.BertAttention.forward", false]], "forward() (tensorrt_llm.layers.attention.cogvlmattention method)": [[78, "tensorrt_llm.layers.attention.CogVLMAttention.forward", false]], "forward() (tensorrt_llm.layers.attention.deepseekv2attention method)": [[78, "tensorrt_llm.layers.attention.DeepseekV2Attention.forward", false]], "forward() (tensorrt_llm.layers.attention.diffusersattention method)": [[78, "tensorrt_llm.layers.attention.DiffusersAttention.forward", false]], "forward() (tensorrt_llm.layers.cast.cast method)": [[78, 
"tensorrt_llm.layers.cast.Cast.forward", false]], "forward() (tensorrt_llm.layers.conv.conv1d method)": [[78, "tensorrt_llm.layers.conv.Conv1d.forward", false]], "forward() (tensorrt_llm.layers.conv.conv2d method)": [[78, "tensorrt_llm.layers.conv.Conv2d.forward", false]], "forward() (tensorrt_llm.layers.conv.conv3d method)": [[78, "tensorrt_llm.layers.conv.Conv3d.forward", false]], "forward() (tensorrt_llm.layers.conv.convtranspose2d method)": [[78, "tensorrt_llm.layers.conv.ConvTranspose2d.forward", false]], "forward() (tensorrt_llm.layers.embedding.combinedtimesteplabelembeddings method)": [[78, "tensorrt_llm.layers.embedding.CombinedTimestepLabelEmbeddings.forward", false]], "forward() (tensorrt_llm.layers.embedding.combinedtimesteptextprojembeddings method)": [[78, "tensorrt_llm.layers.embedding.CombinedTimestepTextProjEmbeddings.forward", false]], "forward() (tensorrt_llm.layers.embedding.embedding method)": [[78, "tensorrt_llm.layers.embedding.Embedding.forward", false]], "forward() (tensorrt_llm.layers.embedding.labelembedding method)": [[78, "tensorrt_llm.layers.embedding.LabelEmbedding.forward", false]], "forward() (tensorrt_llm.layers.embedding.pixartalphatextprojection method)": [[78, "tensorrt_llm.layers.embedding.PixArtAlphaTextProjection.forward", false]], "forward() (tensorrt_llm.layers.embedding.prompttuningembedding method)": [[78, "tensorrt_llm.layers.embedding.PromptTuningEmbedding.forward", false]], "forward() (tensorrt_llm.layers.embedding.sd3patchembed method)": [[78, "tensorrt_llm.layers.embedding.SD3PatchEmbed.forward", false]], "forward() (tensorrt_llm.layers.embedding.timestepembedding method)": [[78, "tensorrt_llm.layers.embedding.TimestepEmbedding.forward", false]], "forward() (tensorrt_llm.layers.embedding.timesteps method)": [[78, "tensorrt_llm.layers.embedding.Timesteps.forward", false]], "forward() (tensorrt_llm.layers.linear.linearbase method)": [[78, "tensorrt_llm.layers.linear.LinearBase.forward", false]], "forward() 
(tensorrt_llm.layers.mlp.fusedgatedmlp method)": [[78, "tensorrt_llm.layers.mlp.FusedGatedMLP.forward", false]], "forward() (tensorrt_llm.layers.mlp.gatedmlp method)": [[78, "tensorrt_llm.layers.mlp.GatedMLP.forward", false]], "forward() (tensorrt_llm.layers.mlp.linearactivation method)": [[78, "tensorrt_llm.layers.mlp.LinearActivation.forward", false]], "forward() (tensorrt_llm.layers.mlp.linearapproximategelu method)": [[78, "tensorrt_llm.layers.mlp.LinearApproximateGELU.forward", false]], "forward() (tensorrt_llm.layers.mlp.lineargeglu method)": [[78, "tensorrt_llm.layers.mlp.LinearGEGLU.forward", false]], "forward() (tensorrt_llm.layers.mlp.lineargelu method)": [[78, "tensorrt_llm.layers.mlp.LinearGELU.forward", false]], "forward() (tensorrt_llm.layers.mlp.linearswiglu method)": [[78, "tensorrt_llm.layers.mlp.LinearSwiGLU.forward", false]], "forward() (tensorrt_llm.layers.mlp.mlp method)": [[78, "tensorrt_llm.layers.mlp.MLP.forward", false]], "forward() (tensorrt_llm.layers.normalization.adalayernorm method)": [[78, "tensorrt_llm.layers.normalization.AdaLayerNorm.forward", false]], "forward() (tensorrt_llm.layers.normalization.adalayernormcontinuous method)": [[78, "tensorrt_llm.layers.normalization.AdaLayerNormContinuous.forward", false]], "forward() (tensorrt_llm.layers.normalization.adalayernormzero method)": [[78, "tensorrt_llm.layers.normalization.AdaLayerNormZero.forward", false]], "forward() (tensorrt_llm.layers.normalization.adalayernormzerosingle method)": [[78, "tensorrt_llm.layers.normalization.AdaLayerNormZeroSingle.forward", false]], "forward() (tensorrt_llm.layers.normalization.groupnorm method)": [[78, "tensorrt_llm.layers.normalization.GroupNorm.forward", false]], "forward() (tensorrt_llm.layers.normalization.layernorm method)": [[78, "tensorrt_llm.layers.normalization.LayerNorm.forward", false]], "forward() (tensorrt_llm.layers.normalization.rmsnorm method)": [[78, "tensorrt_llm.layers.normalization.RmsNorm.forward", false]], "forward() 
(tensorrt_llm.layers.normalization.sd35adalayernormzerox method)": [[78, "tensorrt_llm.layers.normalization.SD35AdaLayerNormZeroX.forward", false]], "forward() (tensorrt_llm.layers.pooling.avgpool2d method)": [[78, "tensorrt_llm.layers.pooling.AvgPool2d.forward", false]], "forward() (tensorrt_llm.models.bertforquestionanswering method)": [[79, "tensorrt_llm.models.BertForQuestionAnswering.forward", false]], "forward() (tensorrt_llm.models.bertforsequenceclassification method)": [[79, "tensorrt_llm.models.BertForSequenceClassification.forward", false]], "forward() (tensorrt_llm.models.bertmodel method)": [[79, "tensorrt_llm.models.BertModel.forward", false]], "forward() (tensorrt_llm.models.bloommodel method)": [[79, "tensorrt_llm.models.BloomModel.forward", false]], "forward() (tensorrt_llm.models.chatglmmodel method)": [[79, "tensorrt_llm.models.ChatGLMModel.forward", false]], "forward() (tensorrt_llm.models.clipvisiontransformer method)": [[79, "tensorrt_llm.models.CLIPVisionTransformer.forward", false]], "forward() (tensorrt_llm.models.decodermodel method)": [[79, "tensorrt_llm.models.DecoderModel.forward", false]], "forward() (tensorrt_llm.models.dit method)": [[79, "tensorrt_llm.models.DiT.forward", false]], "forward() (tensorrt_llm.models.eagleforcausallm method)": [[79, "tensorrt_llm.models.EagleForCausalLM.forward", false]], "forward() (tensorrt_llm.models.encodermodel method)": [[79, "tensorrt_llm.models.EncoderModel.forward", false]], "forward() (tensorrt_llm.models.falconmodel method)": [[79, "tensorrt_llm.models.FalconModel.forward", false]], "forward() (tensorrt_llm.models.gptjmodel method)": [[79, "tensorrt_llm.models.GPTJModel.forward", false]], "forward() (tensorrt_llm.models.gptmodel method)": [[79, "tensorrt_llm.models.GPTModel.forward", false]], "forward() (tensorrt_llm.models.gptneoxmodel method)": [[79, "tensorrt_llm.models.GPTNeoXModel.forward", false]], "forward() (tensorrt_llm.models.llamamodel method)": [[79, 
"tensorrt_llm.models.LLaMAModel.forward", false]], "forward() (tensorrt_llm.models.llavanextvisionwrapper method)": [[79, "tensorrt_llm.models.LlavaNextVisionWrapper.forward", false]], "forward() (tensorrt_llm.models.mambaforcausallm method)": [[79, "tensorrt_llm.models.MambaForCausalLM.forward", false]], "forward() (tensorrt_llm.models.mllamaforcausallm method)": [[79, "tensorrt_llm.models.MLLaMAForCausalLM.forward", false]], "forward() (tensorrt_llm.models.mptmodel method)": [[79, "tensorrt_llm.models.MPTModel.forward", false]], "forward() (tensorrt_llm.models.optmodel method)": [[79, "tensorrt_llm.models.OPTModel.forward", false]], "forward() (tensorrt_llm.models.phi3model method)": [[79, "tensorrt_llm.models.Phi3Model.forward", false]], "forward() (tensorrt_llm.models.phimodel method)": [[79, "tensorrt_llm.models.PhiModel.forward", false]], "forward() (tensorrt_llm.models.recurrentgemmaforcausallm method)": [[79, "tensorrt_llm.models.RecurrentGemmaForCausalLM.forward", false]], "forward() (tensorrt_llm.models.redrafterforcausallm method)": [[79, "tensorrt_llm.models.ReDrafterForCausalLM.forward", false]], "forward() (tensorrt_llm.models.sd3transformer2dmodel method)": [[79, "tensorrt_llm.models.SD3Transformer2DModel.forward", false]], "forward() (tensorrt_llm.models.whisperencoder method)": [[79, "tensorrt_llm.models.WhisperEncoder.forward", false]], "forward_with_cfg() (tensorrt_llm.models.dit method)": [[79, "tensorrt_llm.models.DiT.forward_with_cfg", false]], "forward_without_cfg() (tensorrt_llm.models.dit method)": [[79, "tensorrt_llm.models.DiT.forward_without_cfg", false]], "fp8 (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.FP8", false]], "fp8_block_scales (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.FP8_BLOCK_SCALES", false]], "fp8_per_channel_per_token (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN", false]], 
"free_gpu_memory_fraction (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[65, "tensorrt_llm.llmapi.KvCacheConfig.free_gpu_memory_fraction", false]], "frequency_penalty (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.frequency_penalty", false]], "frequency_penalty (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.frequency_penalty", false]], "from_arguments() (tensorrt_llm.models.speculativedecodingmode static method)": [[79, "tensorrt_llm.models.SpeculativeDecodingMode.from_arguments", false]], "from_checkpoint() (tensorrt_llm.models.pretrainedconfig class method)": [[79, "tensorrt_llm.models.PretrainedConfig.from_checkpoint", false]], "from_checkpoint() (tensorrt_llm.models.pretrainedmodel class method)": [[79, "tensorrt_llm.models.PretrainedModel.from_checkpoint", false]], "from_config() (tensorrt_llm.models.pretrainedmodel class method)": [[79, "tensorrt_llm.models.PretrainedModel.from_config", false]], "from_dict() (tensorrt_llm.llmapi.buildconfig class method)": [[65, "tensorrt_llm.llmapi.BuildConfig.from_dict", false]], "from_dict() (tensorrt_llm.llmapi.calibconfig class method)": [[65, "tensorrt_llm.llmapi.CalibConfig.from_dict", false]], "from_dict() (tensorrt_llm.llmapi.eagledecodingconfig class method)": [[65, "tensorrt_llm.llmapi.EagleDecodingConfig.from_dict", false]], "from_dict() (tensorrt_llm.llmapi.lookaheaddecodingconfig class method)": [[65, "tensorrt_llm.llmapi.LookaheadDecodingConfig.from_dict", false]], "from_dict() (tensorrt_llm.llmapi.medusadecodingconfig class method)": [[65, "tensorrt_llm.llmapi.MedusaDecodingConfig.from_dict", false]], "from_dict() (tensorrt_llm.llmapi.mtpdecodingconfig class method)": [[65, "tensorrt_llm.llmapi.MTPDecodingConfig.from_dict", false]], "from_dict() (tensorrt_llm.llmapi.quantconfig class method)": [[65, "tensorrt_llm.llmapi.QuantConfig.from_dict", false]], "from_dict() (tensorrt_llm.models.pretrainedconfig class method)": 
[[79, "tensorrt_llm.models.PretrainedConfig.from_dict", false]], "from_dir() (tensorrt_llm.runtime.modelrunner class method)": [[82, "tensorrt_llm.runtime.ModelRunner.from_dir", false]], "from_dir() (tensorrt_llm.runtime.modelrunnercpp class method)": [[82, "tensorrt_llm.runtime.ModelRunnerCpp.from_dir", false]], "from_engine() (tensorrt_llm.runtime.encdecmodelrunner class method)": [[82, "tensorrt_llm.runtime.EncDecModelRunner.from_engine", false]], "from_engine() (tensorrt_llm.runtime.modelrunner class method)": [[82, "tensorrt_llm.runtime.ModelRunner.from_engine", false]], "from_engine() (tensorrt_llm.runtime.session static method)": [[82, "tensorrt_llm.runtime.Session.from_engine", false]], "from_hugging_face() (tensorrt_llm.models.baichuanforcausallm class method)": [[79, "tensorrt_llm.models.BaichuanForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.chatglmconfig class method)": [[79, "tensorrt_llm.models.ChatGLMConfig.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.chatglmforcausallm class method)": [[79, "tensorrt_llm.models.ChatGLMForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.cogvlmforcausallm class method)": [[79, "tensorrt_llm.models.CogVLMForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.cohereforcausallm class method)": [[79, "tensorrt_llm.models.CohereForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.deepseekforcausallm class method)": [[79, "tensorrt_llm.models.DeepseekForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.deepseekv2forcausallm class method)": [[79, "tensorrt_llm.models.DeepseekV2ForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.eagleforcausallm class method)": [[79, "tensorrt_llm.models.EagleForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.falconconfig class method)": [[79, 
"tensorrt_llm.models.FalconConfig.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.falconforcausallm class method)": [[79, "tensorrt_llm.models.FalconForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.gemmaconfig class method)": [[79, "tensorrt_llm.models.GemmaConfig.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.gemmaforcausallm class method)": [[79, "tensorrt_llm.models.GemmaForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.gptconfig class method)": [[79, "tensorrt_llm.models.GPTConfig.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.gptforcausallm class method)": [[79, "tensorrt_llm.models.GPTForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.gptjconfig class method)": [[79, "tensorrt_llm.models.GPTJConfig.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.gptjforcausallm class method)": [[79, "tensorrt_llm.models.GPTJForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.llamaconfig class method)": [[79, "tensorrt_llm.models.LLaMAConfig.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.llamaforcausallm class method)": [[79, "tensorrt_llm.models.LLaMAForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.llavanextvisionconfig class method)": [[79, "tensorrt_llm.models.LlavaNextVisionConfig.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.llavanextvisionwrapper class method)": [[79, "tensorrt_llm.models.LlavaNextVisionWrapper.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.mambaforcausallm class method)": [[79, "tensorrt_llm.models.MambaForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.medusaconfig class method)": [[79, "tensorrt_llm.models.MedusaConfig.from_hugging_face", false]], "from_hugging_face() 
(tensorrt_llm.models.medusaforcausallm class method)": [[79, "tensorrt_llm.models.MedusaForCausalLm.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.mllamaforcausallm class method)": [[79, "tensorrt_llm.models.MLLaMAForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.phi3forcausallm class method)": [[79, "tensorrt_llm.models.Phi3ForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.phiforcausallm class method)": [[79, "tensorrt_llm.models.PhiForCausalLM.from_hugging_face", false]], "from_json_file() (tensorrt_llm.llmapi.buildconfig class method)": [[65, "tensorrt_llm.llmapi.BuildConfig.from_json_file", false]], "from_json_file() (tensorrt_llm.models.pretrainedconfig class method)": [[79, "tensorrt_llm.models.PretrainedConfig.from_json_file", false]], "from_meta_ckpt() (tensorrt_llm.models.llamaconfig class method)": [[79, "tensorrt_llm.models.LLaMAConfig.from_meta_ckpt", false]], "from_meta_ckpt() (tensorrt_llm.models.llamaforcausallm class method)": [[79, "tensorrt_llm.models.LLaMAForCausalLM.from_meta_ckpt", false]], "from_nemo() (tensorrt_llm.models.gptconfig class method)": [[79, "tensorrt_llm.models.GPTConfig.from_nemo", false]], "from_nemo() (tensorrt_llm.models.gptforcausallm class method)": [[79, "tensorrt_llm.models.GPTForCausalLM.from_nemo", false]], "from_pretrained() (tensorrt_llm.models.sd3transformer2dmodel class method)": [[79, "tensorrt_llm.models.SD3Transformer2DModel.from_pretrained", false]], "from_serialized_engine() (tensorrt_llm.runtime.session static method)": [[82, "tensorrt_llm.runtime.Session.from_serialized_engine", false]], "from_string() (tensorrt_llm.functional.positionembeddingtype static method)": [[77, "tensorrt_llm.functional.PositionEmbeddingType.from_string", false]], "from_string() (tensorrt_llm.functional.rotaryscalingtype static method)": [[77, "tensorrt_llm.functional.RotaryScalingType.from_string", false]], "fuse_qkv_projections() 
(tensorrt_llm.models.sd3transformer2dmodel method)": [[79, "tensorrt_llm.models.SD3Transformer2DModel.fuse_qkv_projections", false]], "fusedgatedmlp (class in tensorrt_llm.layers.mlp)": [[78, "tensorrt_llm.layers.mlp.FusedGatedMLP", false]], "fusedgatedmlp (tensorrt_llm.functional.mlptype attribute)": [[77, "tensorrt_llm.functional.MLPType.FusedGatedMLP", false]], "gatedmlp (class in tensorrt_llm.layers.mlp)": [[78, "tensorrt_llm.layers.mlp.GatedMLP", false]], "gatedmlp (tensorrt_llm.functional.mlptype attribute)": [[77, "tensorrt_llm.functional.MLPType.GatedMLP", false]], "gather() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.gather", false]], "gather_context_logits (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.gather_context_logits", false]], "gather_context_logits (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.gather_context_logits", false]], "gather_context_logits (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.gather_context_logits", false]], "gather_context_logits (tensorrt_llm.runtime.modelrunner property)": [[82, "tensorrt_llm.runtime.ModelRunner.gather_context_logits", false]], "gather_context_logits (tensorrt_llm.runtime.modelrunnercpp property)": [[82, "tensorrt_llm.runtime.ModelRunnerCpp.gather_context_logits", false]], "gather_generation_logits (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.gather_generation_logits", false]], "gather_generation_logits (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.gather_generation_logits", false]], "gather_generation_logits (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.gather_generation_logits", false]], "gather_generation_logits (tensorrt_llm.runtime.modelrunner property)": [[82, "tensorrt_llm.runtime.ModelRunner.gather_generation_logits", 
false]], "gather_generation_logits (tensorrt_llm.runtime.modelrunnercpp property)": [[82, "tensorrt_llm.runtime.ModelRunnerCpp.gather_generation_logits", false]], "gather_last_token_logits() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.gather_last_token_logits", false]], "gather_nd() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.gather_nd", false]], "gegelu() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.gegelu", false]], "geglu() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.geglu", false]], "gelu() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.gelu", false]], "gemm_allreduce() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.gemm_allreduce", false]], "gemm_allreduce_plugin (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.gemm_allreduce_plugin", false]], "gemm_allreduce_plugin (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.gemm_allreduce_plugin", false]], "gemm_swiglu() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.gemm_swiglu", false]], "gemma2_added_fields (tensorrt_llm.models.gemmaconfig attribute)": [[79, "tensorrt_llm.models.GemmaConfig.GEMMA2_ADDED_FIELDS", false]], "gemma2_config() (tensorrt_llm.models.gemmaconfig method)": [[79, "tensorrt_llm.models.GemmaConfig.gemma2_config", false]], "gemma3_added_fields (tensorrt_llm.models.gemmaconfig attribute)": [[79, "tensorrt_llm.models.GemmaConfig.GEMMA3_ADDED_FIELDS", false]], "gemma3_config() (tensorrt_llm.models.gemmaconfig method)": [[79, "tensorrt_llm.models.GemmaConfig.gemma3_config", false]], "gemma_added_fields (tensorrt_llm.models.gemmaconfig attribute)": [[79, "tensorrt_llm.models.GemmaConfig.GEMMA_ADDED_FIELDS", false]], "gemmaconfig (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.GemmaConfig", false]], "gemmaforcausallm (class in 
tensorrt_llm.models)": [[79, "tensorrt_llm.models.GemmaForCausalLM", false]], "generate() (tensorrt_llm.llmapi.llm method)": [[65, "tensorrt_llm.llmapi.LLM.generate", false]], "generate() (tensorrt_llm.runtime.encdecmodelrunner method)": [[82, "tensorrt_llm.runtime.EncDecModelRunner.generate", false]], "generate() (tensorrt_llm.runtime.modelrunner method)": [[82, "tensorrt_llm.runtime.ModelRunner.generate", false]], "generate() (tensorrt_llm.runtime.modelrunnercpp method)": [[82, "tensorrt_llm.runtime.ModelRunnerCpp.generate", false]], "generate() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.generate", false]], "generate() (tensorrt_llm.runtime.qwenforcausallmgenerationsession method)": [[82, "tensorrt_llm.runtime.QWenForCausalLMGenerationSession.generate", false]], "generate_alibi_biases() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.generate_alibi_biases", false]], "generate_alibi_slopes() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.generate_alibi_slopes", false]], "generate_async() (tensorrt_llm.llmapi.llm method)": [[65, "tensorrt_llm.llmapi.LLM.generate_async", false]], "generate_logn_scaling() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.generate_logn_scaling", false]], "generation_logits (tensorrt_llm.llmapi.completionoutput attribute)": [[65, "tensorrt_llm.llmapi.CompletionOutput.generation_logits", false]], "generationsequence (class in tensorrt_llm.runtime)": [[82, "tensorrt_llm.runtime.GenerationSequence", false]], "generationsession (class in tensorrt_llm.runtime)": [[82, "tensorrt_llm.runtime.GenerationSession", false]], "get_1d_sincos_pos_embed_from_grid() (in module tensorrt_llm.layers.embedding)": [[78, "tensorrt_llm.layers.embedding.get_1d_sincos_pos_embed_from_grid", false]], "get_2d_sincos_pos_embed() (in module tensorrt_llm.layers.embedding)": [[78, "tensorrt_llm.layers.embedding.get_2d_sincos_pos_embed", false]], 
"get_2d_sincos_pos_embed_from_grid() (in module tensorrt_llm.layers.embedding)": [[78, "tensorrt_llm.layers.embedding.get_2d_sincos_pos_embed_from_grid", false]], "get_audio_features() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.get_audio_features", false]], "get_batch_idx() (tensorrt_llm.runtime.generationsequence method)": [[82, "tensorrt_llm.runtime.GenerationSequence.get_batch_idx", false]], "get_block_offsets() (tensorrt_llm.runtime.kvcachemanager method)": [[82, "tensorrt_llm.runtime.KVCacheManager.get_block_offsets", false]], "get_comm() (tensorrt_llm.llmapi.mpicommsession method)": [[65, "tensorrt_llm.llmapi.MpiCommSession.get_comm", false]], "get_config_group() (tensorrt_llm.models.pretrainedconfig method)": [[79, "tensorrt_llm.models.PretrainedConfig.get_config_group", false]], "get_context_phase_params() (tensorrt_llm.llmapi.disaggregatedparams method)": [[65, "tensorrt_llm.llmapi.DisaggregatedParams.get_context_phase_params", false]], "get_first_past_key_value() (tensorrt_llm.layers.attention.keyvaluecacheparams method)": [[78, "tensorrt_llm.layers.attention.KeyValueCacheParams.get_first_past_key_value", false]], "get_hf_config() (tensorrt_llm.models.gemmaconfig static method)": [[79, "tensorrt_llm.models.GemmaConfig.get_hf_config", false]], "get_kv_cache_events() (tensorrt_llm.llmapi.llm method)": [[65, "tensorrt_llm.llmapi.LLM.get_kv_cache_events", false]], "get_kv_cache_events_async() (tensorrt_llm.llmapi.llm method)": [[65, "tensorrt_llm.llmapi.LLM.get_kv_cache_events_async", false]], "get_next_medusa_tokens() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.get_next_medusa_tokens", false]], "get_num_heads_kv() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.get_num_heads_kv", false]], "get_parent() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.get_parent", 
false]], "get_request_type() (tensorrt_llm.llmapi.disaggregatedparams method)": [[65, "tensorrt_llm.llmapi.DisaggregatedParams.get_request_type", false]], "get_rope_index() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.get_rope_index", false]], "get_seq_idx() (tensorrt_llm.runtime.generationsequence method)": [[82, "tensorrt_llm.runtime.GenerationSequence.get_seq_idx", false]], "get_stats() (tensorrt_llm.llmapi.llm method)": [[65, "tensorrt_llm.llmapi.LLM.get_stats", false]], "get_stats_async() (tensorrt_llm.llmapi.llm method)": [[65, "tensorrt_llm.llmapi.LLM.get_stats_async", false]], "get_timestep_embedding() (in module tensorrt_llm.layers.embedding)": [[78, "tensorrt_llm.layers.embedding.get_timestep_embedding", false]], "get_users() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.get_users", false]], "get_visual_features() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.get_visual_features", false]], "get_weight() (tensorrt_llm.layers.linear.linearbase method)": [[78, "tensorrt_llm.layers.linear.LinearBase.get_weight", false]], "gpt_attention() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.gpt_attention", false]], "gpt_attention_plugin (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.gpt_attention_plugin", false]], "gptconfig (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.GPTConfig", false]], "gptforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.GPTForCausalLM", false]], "gptjconfig (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.GPTJConfig", false]], "gptjforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.GPTJForCausalLM", false]], "gptjmodel (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.GPTJModel", false]], "gptmodel (class in tensorrt_llm.models)": [[79, 
"tensorrt_llm.models.GPTModel", false]], "gptneoxforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.GPTNeoXForCausalLM", false]], "gptneoxmodel (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.GPTNeoXModel", false]], "gpu_weights_percent (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.gpu_weights_percent", false]], "grammar (tensorrt_llm.llmapi.guideddecodingparams attribute)": [[65, "tensorrt_llm.llmapi.GuidedDecodingParams.grammar", false]], "greedy_sampling (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.EagleDecodingConfig.greedy_sampling", false]], "group_norm() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.group_norm", false]], "group_size (tensorrt_llm.llmapi.quantconfig attribute)": [[65, "tensorrt_llm.llmapi.QuantConfig.group_size", false]], "groupnorm (class in tensorrt_llm.layers.normalization)": [[78, "tensorrt_llm.layers.normalization.GroupNorm", false]], "groupnorm (tensorrt_llm.functional.layernormtype attribute)": [[77, "tensorrt_llm.functional.LayerNormType.GroupNorm", false]], "gt() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.gt", false]], "guaranteed_no_evict (tensorrt_llm.llmapi.capacityschedulerpolicy attribute)": [[65, "tensorrt_llm.llmapi.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT", false]], "guided_decoding (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.guided_decoding", false]], "guideddecodingparams (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.GuidedDecodingParams", false]], "handle_per_step() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.handle_per_step", false]], "has_affine() (tensorrt_llm.functional.allreduceparams method)": [[77, "tensorrt_llm.functional.AllReduceParams.has_affine", false]], "has_bias() (tensorrt_llm.functional.allreduceparams method)": [[77, 
"tensorrt_llm.functional.AllReduceParams.has_bias", false]], "has_config_group() (tensorrt_llm.models.pretrainedconfig method)": [[79, "tensorrt_llm.models.PretrainedConfig.has_config_group", false]], "has_position_embedding (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.has_position_embedding", false]], "has_position_embedding (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.has_position_embedding", false]], "has_scale() (tensorrt_llm.functional.allreduceparams method)": [[77, "tensorrt_llm.functional.AllReduceParams.has_scale", false]], "has_token_type_embedding (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.has_token_type_embedding", false]], "has_token_type_embedding (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.has_token_type_embedding", false]], "has_zero_point (tensorrt_llm.llmapi.quantconfig attribute)": [[65, "tensorrt_llm.llmapi.QuantConfig.has_zero_point", false]], "head_size (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.head_size", false]], "head_size (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.head_size", false]], "hidden_size (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.hidden_size", false]], "hidden_size (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.hidden_size", false]], "hidden_size (tensorrt_llm.runtime.modelrunner property)": [[82, "tensorrt_llm.runtime.ModelRunner.hidden_size", false]], "hidden_size (tensorrt_llm.runtime.modelrunnercpp property)": [[82, "tensorrt_llm.runtime.ModelRunnerCpp.hidden_size", false]], "host_cache_size (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[65, "tensorrt_llm.llmapi.KvCacheConfig.host_cache_size", false]], "identity() (in module 
tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.identity", false]], "ignore_eos (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.ignore_eos", false]], "include_stop_str_in_output (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.include_stop_str_in_output", false]], "index (tensorrt_llm.llmapi.completionoutput attribute)": [[65, "tensorrt_llm.llmapi.CompletionOutput.index", false]], "index_select() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.index_select", false]], "infer_shapes() (tensorrt_llm.runtime.session method)": [[82, "tensorrt_llm.runtime.Session.infer_shapes", false]], "inflight (tensorrt_llm.llmapi.batchingtype attribute)": [[65, "tensorrt_llm.llmapi.BatchingType.INFLIGHT", false]], "init_audio_encoder() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.init_audio_encoder", false]], "init_image_encoder() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.init_image_encoder", false]], "init_llm() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.init_llm", false]], "init_processor() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.init_processor", false]], "init_tokenizer() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.init_tokenizer", false]], "input_timing_cache (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.input_timing_cache", false]], "int8 (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.INT8", false]], "int_clip() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.int_clip", false]], "interpolate() (in module tensorrt_llm.functional)": [[77, 
"tensorrt_llm.functional.interpolate", false]], "is_alibi() (tensorrt_llm.functional.positionembeddingtype method)": [[77, "tensorrt_llm.functional.PositionEmbeddingType.is_alibi", false]], "is_deferred() (tensorrt_llm.functional.positionembeddingtype method)": [[77, "tensorrt_llm.functional.PositionEmbeddingType.is_deferred", false]], "is_dynamic() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.is_dynamic", false]], "is_gated_activation() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.is_gated_activation", false]], "is_gemma_2 (tensorrt_llm.models.gemmaconfig property)": [[79, "tensorrt_llm.models.GemmaConfig.is_gemma_2", false]], "is_gemma_3 (tensorrt_llm.models.gemmaconfig property)": [[79, "tensorrt_llm.models.GemmaConfig.is_gemma_3", false]], "is_medusa_mode (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.is_medusa_mode", false]], "is_module_excluded_from_quantization() (tensorrt_llm.llmapi.quantconfig method)": [[65, "tensorrt_llm.llmapi.QuantConfig.is_module_excluded_from_quantization", false]], "is_mrope() (tensorrt_llm.functional.positionembeddingtype method)": [[77, "tensorrt_llm.functional.PositionEmbeddingType.is_mrope", false]], "is_redrafter_mode (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.is_redrafter_mode", false]], "is_rope() (tensorrt_llm.functional.positionembeddingtype method)": [[77, "tensorrt_llm.functional.PositionEmbeddingType.is_rope", false]], "is_trt_wrapper() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.is_trt_wrapper", false]], "is_valid() (tensorrt_llm.layers.attention.attentionparams method)": [[78, "tensorrt_llm.layers.attention.AttentionParams.is_valid", false]], "is_valid() (tensorrt_llm.layers.attention.keyvaluecacheparams method)": [[78, "tensorrt_llm.layers.attention.KeyValueCacheParams.is_valid", false]], "is_valid_cross_attn() 
(tensorrt_llm.layers.attention.attentionparams method)": [[78, "tensorrt_llm.layers.attention.AttentionParams.is_valid_cross_attn", false]], "joint_attn_forward() (tensorrt_llm.layers.attention.diffusersattention method)": [[78, "tensorrt_llm.layers.attention.DiffusersAttention.joint_attn_forward", false]], "json (tensorrt_llm.llmapi.guideddecodingparams attribute)": [[65, "tensorrt_llm.llmapi.GuidedDecodingParams.json", false]], "json_object (tensorrt_llm.llmapi.guideddecodingparams attribute)": [[65, "tensorrt_llm.llmapi.GuidedDecodingParams.json_object", false]], "keyvaluecacheparams (class in tensorrt_llm.layers.attention)": [[78, "tensorrt_llm.layers.attention.KeyValueCacheParams", false]], "kv_cache_quant_algo (tensorrt_llm.llmapi.quantconfig attribute)": [[65, "tensorrt_llm.llmapi.QuantConfig.kv_cache_quant_algo", false]], "kv_cache_type (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.kv_cache_type", false]], "kv_cache_type (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.kv_cache_type", false]], "kv_cache_type (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.kv_cache_type", false]], "kv_dtype (tensorrt_llm.models.pretrainedconfig property)": [[79, "tensorrt_llm.models.PretrainedConfig.kv_dtype", false]], "kvcacheconfig (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.KvCacheConfig", false]], "kvcachemanager (class in tensorrt_llm.runtime)": [[82, "tensorrt_llm.runtime.KVCacheManager", false]], "kvcacheretentionconfig (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.KvCacheRetentionConfig", false]], "kvcacheretentionconfig.tokenrangeretentionconfig (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig", false]], "labelembedding (class in tensorrt_llm.layers.embedding)": [[78, "tensorrt_llm.layers.embedding.LabelEmbedding", false]], "language_adapter_config 
(tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.language_adapter_config", false]], "last_layer (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.last_layer", false]], "last_process_for_ub (tensorrt_llm.functional.allreducefusionop attribute)": [[77, "tensorrt_llm.functional.AllReduceFusionOp.LAST_PROCESS_FOR_UB", false]], "layer_norm() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.layer_norm", false]], "layer_quant_mode (tensorrt_llm.llmapi.quantconfig property)": [[65, "tensorrt_llm.llmapi.QuantConfig.layer_quant_mode", false]], "layer_types (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.layer_types", false]], "layernorm (class in tensorrt_llm.layers.normalization)": [[78, "tensorrt_llm.layers.normalization.LayerNorm", false]], "layernorm (tensorrt_llm.functional.layernormtype attribute)": [[77, "tensorrt_llm.functional.LayerNormType.LayerNorm", false]], "layernormpositiontype (class in tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.LayerNormPositionType", false]], "layernormtype (class in tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.LayerNormType", false]], "learned_absolute (tensorrt_llm.functional.positionembeddingtype attribute)": [[77, "tensorrt_llm.functional.PositionEmbeddingType.learned_absolute", false]], "length (tensorrt_llm.llmapi.completionoutput attribute)": [[65, "tensorrt_llm.llmapi.CompletionOutput.length", false]], "length (tensorrt_llm.llmapi.completionoutput property)": [[65, "id2", false]], "length_penalty (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.length_penalty", false]], "length_penalty (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.length_penalty", false]], "linear (class in tensorrt_llm.layers.linear)": [[78, "tensorrt_llm.layers.linear.Linear", false]], "linear 
(tensorrt_llm.functional.rotaryscalingtype attribute)": [[77, "tensorrt_llm.functional.RotaryScalingType.linear", false]], "linearactivation (class in tensorrt_llm.layers.mlp)": [[78, "tensorrt_llm.layers.mlp.LinearActivation", false]], "linearapproximategelu (class in tensorrt_llm.layers.mlp)": [[78, "tensorrt_llm.layers.mlp.LinearApproximateGELU", false]], "linearbase (class in tensorrt_llm.layers.linear)": [[78, "tensorrt_llm.layers.linear.LinearBase", false]], "lineargeglu (class in tensorrt_llm.layers.mlp)": [[78, "tensorrt_llm.layers.mlp.LinearGEGLU", false]], "lineargelu (class in tensorrt_llm.layers.mlp)": [[78, "tensorrt_llm.layers.mlp.LinearGELU", false]], "linearswiglu (class in tensorrt_llm.layers.mlp)": [[78, "tensorrt_llm.layers.mlp.LinearSwiGLU", false]], "llama3 (tensorrt_llm.functional.rotaryscalingtype attribute)": [[77, "tensorrt_llm.functional.RotaryScalingType.llama3", false]], "llamaconfig (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.LLaMAConfig", false]], "llamaforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.LLaMAForCausalLM", false]], "llamamodel (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.LLaMAModel", false]], "llavanextvisionconfig (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.LlavaNextVisionConfig", false]], "llavanextvisionwrapper (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.LlavaNextVisionWrapper", false]], "llm (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.LLM", false]], "llm_engine_dir (tensorrt_llm.runtime.multimodalmodelrunner property)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.llm_engine_dir", false]], "load() (tensorrt_llm.models.pretrainedmodel method)": [[79, "tensorrt_llm.models.PretrainedModel.load", false]], "load() (tensorrt_llm.models.sd3transformer2dmodel method)": [[79, "tensorrt_llm.models.SD3Transformer2DModel.load", false]], "load_test_audio() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, 
"tensorrt_llm.runtime.MultimodalModelRunner.load_test_audio", false]], "load_test_data() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.load_test_data", false]], "locate_accepted_draft_tokens() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.locate_accepted_draft_tokens", false]], "location (tensorrt_llm.functional.tensor property)": [[77, "tensorrt_llm.functional.Tensor.location", false]], "log() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.log", false]], "log() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.log", false]], "log_softmax() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.log_softmax", false]], "logits_processor (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.logits_processor", false]], "logitsprocessor (class in tensorrt_llm.runtime)": [[82, "tensorrt_llm.runtime.LogitsProcessor", false]], "logitsprocessorlist (class in tensorrt_llm.runtime)": [[82, "tensorrt_llm.runtime.LogitsProcessorList", false]], "logprobs (tensorrt_llm.llmapi.completionoutput attribute)": [[65, "tensorrt_llm.llmapi.CompletionOutput.logprobs", false]], "logprobs (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.logprobs", false]], "logprobs_diff (tensorrt_llm.llmapi.completionoutput attribute)": [[65, "tensorrt_llm.llmapi.CompletionOutput.logprobs_diff", false]], "logprobs_diff (tensorrt_llm.llmapi.completionoutput property)": [[65, "id3", false]], "long_rope (tensorrt_llm.functional.positionembeddingtype attribute)": [[77, "tensorrt_llm.functional.PositionEmbeddingType.long_rope", false]], "longrope (tensorrt_llm.functional.rotaryscalingtype attribute)": [[77, "tensorrt_llm.functional.RotaryScalingType.longrope", false]], "lookahead_config (tensorrt_llm.llmapi.samplingparams attribute)": [[65, 
"tensorrt_llm.llmapi.SamplingParams.lookahead_config", false]], "lookahead_decoding (tensorrt_llm.models.speculativedecodingmode attribute)": [[79, "tensorrt_llm.models.SpeculativeDecodingMode.LOOKAHEAD_DECODING", false]], "lookaheaddecodingconfig (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.LookaheadDecodingConfig", false]], "lora_config (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.lora_config", false]], "lora_plugin (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.lora_plugin", false]], "lora_plugin() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.lora_plugin", false]], "lora_target_modules (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.lora_target_modules", false]], "low_latency_gemm() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.low_latency_gemm", false]], "low_latency_gemm_swiglu() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.low_latency_gemm_swiglu", false]], "lt() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.lt", false]], "make_causal_mask() (in module tensorrt_llm.layers.attention)": [[78, "tensorrt_llm.layers.attention.make_causal_mask", false]], "mamba_conv1d() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.mamba_conv1d", false]], "mamba_conv1d_plugin (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.mamba_conv1d_plugin", false]], "mambaforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.MambaForCausalLM", false]], "mapping (tensorrt_llm.runtime.generationsession attribute)": [[82, "tensorrt_llm.runtime.GenerationSession.mapping", false]], "mapping (tensorrt_llm.runtime.modelrunner property)": [[82, "tensorrt_llm.runtime.ModelRunner.mapping", false]], "mark_output() (tensorrt_llm.functional.tensor method)": [[77, 
"tensorrt_llm.functional.Tensor.mark_output", false]], "masked_scatter() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.masked_scatter", false]], "masked_select() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.masked_select", false]], "matmul() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.matmul", false]], "max() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.max", false]], "max() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.max", false]], "max_attention_window (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[65, "tensorrt_llm.llmapi.KvCacheConfig.max_attention_window", false]], "max_attention_window_size (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.max_attention_window_size", false]], "max_batch_size (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.max_batch_size", false]], "max_batch_size (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.max_batch_size", false]], "max_beam_width (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.max_beam_width", false]], "max_beam_width (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.max_beam_width", false]], "max_cache_storage_gb (tensorrt_llm.llmapi.buildcacheconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildCacheConfig.max_cache_storage_gb", false]], "max_cache_storage_gb (tensorrt_llm.llmapi.buildcacheconfig property)": [[65, "id8", false]], "max_draft_len (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.max_draft_len", false]], "max_draft_tokens (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.max_draft_tokens", false]], "max_encoder_input_len (tensorrt_llm.llmapi.buildconfig attribute)": [[65, 
"tensorrt_llm.llmapi.BuildConfig.max_encoder_input_len", false]], "max_input_len (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.max_input_len", false]], "max_medusa_tokens (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.max_medusa_tokens", false]], "max_new_tokens (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.max_new_tokens", false]], "max_ngram_size (tensorrt_llm.llmapi.lookaheaddecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.LookaheadDecodingConfig.max_ngram_size", false]], "max_non_leaves_per_layer (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.EagleDecodingConfig.max_non_leaves_per_layer", false]], "max_num_tokens (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.max_num_tokens", false]], "max_num_tokens (tensorrt_llm.llmapi.cachetransceiverconfig attribute)": [[65, "tensorrt_llm.llmapi.CacheTransceiverConfig.max_num_tokens", false]], "max_prompt_embedding_table_size (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.max_prompt_embedding_table_size", false]], "max_prompt_embedding_table_size (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.max_prompt_embedding_table_size", false]], "max_prompt_embedding_table_size (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.max_prompt_embedding_table_size", false]], "max_prompt_embedding_table_size (tensorrt_llm.runtime.modelrunner property)": [[82, "tensorrt_llm.runtime.ModelRunner.max_prompt_embedding_table_size", false]], "max_prompt_embedding_table_size (tensorrt_llm.runtime.modelrunnercpp property)": [[82, "tensorrt_llm.runtime.ModelRunnerCpp.max_prompt_embedding_table_size", false]], "max_records (tensorrt_llm.llmapi.buildcacheconfig attribute)": [[65, 
"tensorrt_llm.llmapi.BuildCacheConfig.max_records", false]], "max_records (tensorrt_llm.llmapi.buildcacheconfig property)": [[65, "id9", false]], "max_seq_len (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.max_seq_len", false]], "max_sequence_length (tensorrt_llm.runtime.modelrunner property)": [[82, "tensorrt_llm.runtime.ModelRunner.max_sequence_length", false]], "max_sequence_length (tensorrt_llm.runtime.modelrunnercpp property)": [[82, "tensorrt_llm.runtime.ModelRunnerCpp.max_sequence_length", false]], "max_tokens (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[65, "tensorrt_llm.llmapi.KvCacheConfig.max_tokens", false]], "max_tokens (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.max_tokens", false]], "max_utilization (tensorrt_llm.llmapi.capacityschedulerpolicy attribute)": [[65, "tensorrt_llm.llmapi.CapacitySchedulerPolicy.MAX_UTILIZATION", false]], "max_verification_set_size (tensorrt_llm.llmapi.lookaheaddecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.LookaheadDecodingConfig.max_verification_set_size", false]], "max_window_size (tensorrt_llm.llmapi.lookaheaddecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.LookaheadDecodingConfig.max_window_size", false]], "maximum() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.maximum", false]], "mean() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.mean", false]], "mean() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.mean", false]], "medusa (tensorrt_llm.models.speculativedecodingmode attribute)": [[79, "tensorrt_llm.models.SpeculativeDecodingMode.MEDUSA", false]], "medusa_choices (tensorrt_llm.llmapi.medusadecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.MedusaDecodingConfig.medusa_choices", false]], "medusa_decode_and_verify() (tensorrt_llm.runtime.generationsession method)": [[82, 
"tensorrt_llm.runtime.GenerationSession.medusa_decode_and_verify", false]], "medusa_paths (tensorrt_llm.runtime.generationsession attribute)": [[82, "tensorrt_llm.runtime.GenerationSession.medusa_paths", false]], "medusa_position_offsets (tensorrt_llm.runtime.generationsession attribute)": [[82, "tensorrt_llm.runtime.GenerationSession.medusa_position_offsets", false]], "medusa_temperature (tensorrt_llm.runtime.generationsession attribute)": [[82, "tensorrt_llm.runtime.GenerationSession.medusa_temperature", false]], "medusa_topks (tensorrt_llm.runtime.generationsession attribute)": [[82, "tensorrt_llm.runtime.GenerationSession.medusa_topks", false]], "medusa_tree_ids (tensorrt_llm.runtime.generationsession attribute)": [[82, "tensorrt_llm.runtime.GenerationSession.medusa_tree_ids", false]], "medusaconfig (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.MedusaConfig", false]], "medusadecodingconfig (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.MedusaDecodingConfig", false]], "medusaforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.MedusaForCausalLm", false]], "meshgrid2d() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.meshgrid2d", false]], "min() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.min", false]], "min_latency (tensorrt_llm.functional.allreducestrategy attribute)": [[77, "tensorrt_llm.functional.AllReduceStrategy.MIN_LATENCY", false]], "min_length (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.min_length", false]], "min_p (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.min_p", false]], "min_p (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.min_p", false]], "min_tokens (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.min_tokens", false]], "minimum() (in module tensorrt_llm.functional)": [[77, 
"tensorrt_llm.functional.minimum", false]], "mish (class in tensorrt_llm.layers.activation)": [[78, "tensorrt_llm.layers.activation.Mish", false]], "mixed_precision (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.MIXED_PRECISION", false]], "mllamaforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.MLLaMAForCausalLM", false]], "mlp (class in tensorrt_llm.layers.mlp)": [[78, "tensorrt_llm.layers.mlp.MLP", false]], "mlp (tensorrt_llm.functional.mlptype attribute)": [[77, "tensorrt_llm.functional.MLPType.MLP", false]], "mlptype (class in tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.MLPType", false]], "model": [[26, "cmdoption-trtllm-serve-serve-arg-MODEL", false]], "model_config (tensorrt_llm.llmapi.cachetransceiverconfig attribute)": [[65, "tensorrt_llm.llmapi.CacheTransceiverConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.calibconfig attribute)": [[65, "tensorrt_llm.llmapi.CalibConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.dynamicbatchconfig attribute)": [[65, "tensorrt_llm.llmapi.DynamicBatchConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.EagleDecodingConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.extendedruntimeperfknobconfig attribute)": [[65, "tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[65, "tensorrt_llm.llmapi.KvCacheConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.lookaheaddecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.LookaheadDecodingConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.medusadecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.MedusaDecodingConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.mtpdecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.MTPDecodingConfig.model_config", false]], 
"model_config (tensorrt_llm.llmapi.schedulerconfig attribute)": [[65, "tensorrt_llm.llmapi.SchedulerConfig.model_config", false]], "model_name (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.model_name", false]], "modelconfig (class in tensorrt_llm.runtime)": [[82, "tensorrt_llm.runtime.ModelConfig", false]], "modelrunner (class in tensorrt_llm.runtime)": [[82, "tensorrt_llm.runtime.ModelRunner", false]], "modelrunnercpp (class in tensorrt_llm.runtime)": [[82, "tensorrt_llm.runtime.ModelRunnerCpp", false]], "module": [[77, "module-tensorrt_llm", false], [77, "module-tensorrt_llm.functional", false], [78, "module-tensorrt_llm", false], [78, "module-tensorrt_llm.layers.activation", false], [78, "module-tensorrt_llm.layers.attention", false], [78, "module-tensorrt_llm.layers.cast", false], [78, "module-tensorrt_llm.layers.conv", false], [78, "module-tensorrt_llm.layers.embedding", false], [78, "module-tensorrt_llm.layers.linear", false], [78, "module-tensorrt_llm.layers.mlp", false], [78, "module-tensorrt_llm.layers.normalization", false], [78, "module-tensorrt_llm.layers.pooling", false], [79, "module-tensorrt_llm", false], [79, "module-tensorrt_llm.models", false], [80, "module-tensorrt_llm", false], [80, "module-tensorrt_llm.plugin", false], [81, "module-tensorrt_llm", false], [81, "module-tensorrt_llm.quantization", false], [82, "module-tensorrt_llm", false], [82, "module-tensorrt_llm.runtime", false]], "modulo() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.modulo", false]], "moe (tensorrt_llm.functional.sidestreamidtype attribute)": [[77, "tensorrt_llm.functional.SideStreamIDType.moe", false]], "moe_allreduce_residual_rms_norm (tensorrt_llm.functional.allreducefusionop attribute)": [[77, "tensorrt_llm.functional.AllReduceFusionOp.MOE_ALLREDUCE_RESIDUAL_RMS_NORM", false]], "monitor_memory (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.monitor_memory", false]], 
"mpicommsession (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.MpiCommSession", false]], "mptforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.MPTForCausalLM", false]], "mptmodel (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.MPTModel", false]], "mrope (tensorrt_llm.functional.positionembeddingtype attribute)": [[77, "tensorrt_llm.functional.PositionEmbeddingType.mrope", false]], "mrope (tensorrt_llm.functional.rotaryscalingtype attribute)": [[77, "tensorrt_llm.functional.RotaryScalingType.mrope", false]], "mropeparams (class in tensorrt_llm.layers.attention)": [[78, "tensorrt_llm.layers.attention.MropeParams", false]], "mtpdecodingconfig (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.MTPDecodingConfig", false]], "mul() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.mul", false]], "multi_block_mode (tensorrt_llm.llmapi.extendedruntimeperfknobconfig attribute)": [[65, "tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.multi_block_mode", false]], "multimodalmodelrunner (class in tensorrt_llm.runtime)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner", false]], "multiply_and_lora() (tensorrt_llm.layers.linear.linearbase method)": [[78, "tensorrt_llm.layers.linear.LinearBase.multiply_and_lora", false]], "multiply_collect() (tensorrt_llm.layers.linear.linearbase method)": [[78, "tensorrt_llm.layers.linear.LinearBase.multiply_collect", false]], "multiply_collect() (tensorrt_llm.layers.linear.rowlinear method)": [[78, "tensorrt_llm.layers.linear.RowLinear.multiply_collect", false]], "n (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.n", false]], "name (tensorrt_llm.functional.tensor property)": [[77, "tensorrt_llm.functional.Tensor.name", false]], "name (tensorrt_llm.runtime.tensorinfo attribute)": [[82, "tensorrt_llm.runtime.TensorInfo.name", false]], "native_quant_flow (tensorrt_llm.models.gemmaforcausallm attribute)": [[79, 
"tensorrt_llm.models.GemmaForCausalLM.NATIVE_QUANT_FLOW", false]], "nccl (tensorrt_llm.functional.allreducestrategy attribute)": [[77, "tensorrt_llm.functional.AllReduceStrategy.NCCL", false]], "ndim() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.ndim", false]], "network (tensorrt_llm.functional.tensor property)": [[77, "tensorrt_llm.functional.Tensor.network", false]], "next_medusa_input_ids() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.next_medusa_input_ids", false]], "no_quant (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.NO_QUANT", false]], "no_repeat_ngram_size (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.no_repeat_ngram_size", false]], "no_repeat_ngram_size (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.no_repeat_ngram_size", false]], "non_gated_version() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.non_gated_version", false]], "none (tensorrt_llm.functional.allreducefusionop attribute)": [[77, "tensorrt_llm.functional.AllReduceFusionOp.NONE", false]], "none (tensorrt_llm.functional.rotaryscalingtype attribute)": [[77, "tensorrt_llm.functional.RotaryScalingType.none", false]], "none (tensorrt_llm.models.speculativedecodingmode attribute)": [[79, "tensorrt_llm.models.SpeculativeDecodingMode.NONE", false]], "nonzero() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.nonzero", false]], "not_op() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.not_op", false]], "num_beams (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.num_beams", false]], "num_draft_tokens (tensorrt_llm.runtime.generationsession attribute)": [[82, "tensorrt_llm.runtime.GenerationSession.num_draft_tokens", false]], "num_eagle_layers (tensorrt_llm.llmapi.eagledecodingconfig 
attribute)": [[65, "tensorrt_llm.llmapi.EagleDecodingConfig.num_eagle_layers", false]], "num_heads (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.num_heads", false]], "num_heads (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.num_heads", false]], "num_heads (tensorrt_llm.runtime.modelrunner property)": [[82, "tensorrt_llm.runtime.ModelRunner.num_heads", false]], "num_heads (tensorrt_llm.runtime.modelrunnercpp property)": [[82, "tensorrt_llm.runtime.ModelRunnerCpp.num_heads", false]], "num_kv_heads (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.num_kv_heads", false]], "num_kv_heads_per_cross_attn_layer (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.num_kv_heads_per_cross_attn_layer", false]], "num_kv_heads_per_layer (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.num_kv_heads_per_layer", false]], "num_layers (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.num_layers", false]], "num_layers (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.num_layers", false]], "num_layers (tensorrt_llm.runtime.modelrunner property)": [[82, "tensorrt_llm.runtime.ModelRunner.num_layers", false]], "num_layers (tensorrt_llm.runtime.modelrunnercpp property)": [[82, "tensorrt_llm.runtime.ModelRunnerCpp.num_layers", false]], "num_medusa_heads (tensorrt_llm.llmapi.medusadecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.MedusaDecodingConfig.num_medusa_heads", false]], "num_medusa_heads (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.num_medusa_heads", false]], "num_medusa_heads (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.num_medusa_heads", false]], "num_nextn_predict_layers 
(tensorrt_llm.llmapi.mtpdecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.MTPDecodingConfig.num_nextn_predict_layers", false]], "num_return_sequences (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.num_return_sequences", false]], "numel() (tensorrt_llm.runtime.tensorinfo method)": [[82, "tensorrt_llm.runtime.TensorInfo.numel", false]], "nvfp4 (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.NVFP4", false]], "nvinfer1 (c++ type)": [[1, "_CPPv48nvinfer1", false]], "onboard_blocks (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[65, "tensorrt_llm.llmapi.KvCacheConfig.onboard_blocks", false]], "oneshot (tensorrt_llm.functional.allreducestrategy attribute)": [[77, "tensorrt_llm.functional.AllReduceStrategy.ONESHOT", false]], "op_and() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.op_and", false]], "op_or() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.op_or", false]], "op_xor() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.op_xor", false]], "opaque_state (tensorrt_llm.llmapi.disaggregatedparams attribute)": [[65, "tensorrt_llm.llmapi.DisaggregatedParams.opaque_state", false]], "opt_batch_size (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.opt_batch_size", false]], "opt_num_tokens (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.opt_num_tokens", false]], "optforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.OPTForCausalLM", false]], "optmodel (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.OPTModel", false]], "outer() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.outer", false]], "output_cum_log_probs (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.output_cum_log_probs", false]], "output_log_probs (tensorrt_llm.runtime.samplingconfig attribute)": 
[[82, "tensorrt_llm.runtime.SamplingConfig.output_log_probs", false]], "output_sequence_lengths (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.output_sequence_lengths", false]], "output_timing_cache (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.output_timing_cache", false]], "outputs (tensorrt_llm.llmapi.requestoutput attribute)": [[65, "tensorrt_llm.llmapi.RequestOutput.outputs", false]], "pad() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.pad", false]], "pad_id (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.pad_id", false]], "pad_id (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.pad_id", false]], "padding (tensorrt_llm.functional.attentionmasktype attribute)": [[77, "tensorrt_llm.functional.AttentionMaskType.padding", false]], "paged_kv_cache (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.paged_kv_cache", false]], "paged_state (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.paged_state", false]], "paged_state (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.paged_state", false]], "permute() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.permute", false]], "permute() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.permute", false]], "phi3forcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.Phi3ForCausalLM", false]], "phi3model (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.Phi3Model", false]], "phiforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.PhiForCausalLM", false]], "phimodel (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.PhiModel", false]], "pixartalphatextprojection (class in 
tensorrt_llm.layers.embedding)": [[78, "tensorrt_llm.layers.embedding.PixArtAlphaTextProjection", false]], "plugin_config (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.plugin_config", false]], "pluginconfig (class in tensorrt_llm.plugin)": [[80, "tensorrt_llm.plugin.PluginConfig", false]], "positionembeddingtype (class in tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.PositionEmbeddingType", false]], "post_layernorm (tensorrt_llm.functional.layernormpositiontype attribute)": [[77, "tensorrt_llm.functional.LayerNormPositionType.post_layernorm", false]], "posterior_threshold (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.EagleDecodingConfig.posterior_threshold", false]], "postprocess() (tensorrt_llm.layers.attention.attention method)": [[78, "tensorrt_llm.layers.attention.Attention.postprocess", false]], "postprocess() (tensorrt_llm.layers.attention.deepseekv2attention method)": [[78, "tensorrt_llm.layers.attention.DeepseekV2Attention.postprocess", false]], "postprocess() (tensorrt_llm.layers.embedding.embedding method)": [[78, "tensorrt_llm.layers.embedding.Embedding.postprocess", false]], "postprocess() (tensorrt_llm.layers.linear.linear method)": [[78, "tensorrt_llm.layers.linear.Linear.postprocess", false]], "pow() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.pow", false]], "pp_communicate_final_output_ids() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.pp_communicate_final_output_ids", false]], "pp_communicate_new_tokens() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.pp_communicate_new_tokens", false]], "pre_layernorm (tensorrt_llm.functional.layernormpositiontype attribute)": [[77, "tensorrt_llm.functional.LayerNormPositionType.pre_layernorm", false]], "pre_quant_scale (tensorrt_llm.llmapi.quantconfig attribute)": [[65, 
"tensorrt_llm.llmapi.QuantConfig.pre_quant_scale", false]], "precompute_relative_attention_bias() (tensorrt_llm.models.decodermodel method)": [[79, "tensorrt_llm.models.DecoderModel.precompute_relative_attention_bias", false]], "precompute_relative_attention_bias() (tensorrt_llm.models.encodermodel method)": [[79, "tensorrt_llm.models.EncoderModel.precompute_relative_attention_bias", false]], "precompute_relative_attention_bias() (tensorrt_llm.models.whisperencoder method)": [[79, "tensorrt_llm.models.WhisperEncoder.precompute_relative_attention_bias", false]], "prepare_inputs() (tensorrt_llm.models.chatglmforcausallm method)": [[79, "tensorrt_llm.models.ChatGLMForCausalLM.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.decodermodel method)": [[79, "tensorrt_llm.models.DecoderModel.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.dit method)": [[79, "tensorrt_llm.models.DiT.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.eagleforcausallm method)": [[79, "tensorrt_llm.models.EagleForCausalLM.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.encodermodel method)": [[79, "tensorrt_llm.models.EncoderModel.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.llavanextvisionwrapper method)": [[79, "tensorrt_llm.models.LlavaNextVisionWrapper.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.mambaforcausallm method)": [[79, "tensorrt_llm.models.MambaForCausalLM.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.mllamaforcausallm method)": [[79, "tensorrt_llm.models.MLLaMAForCausalLM.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.pretrainedmodel method)": [[79, "tensorrt_llm.models.PretrainedModel.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.recurrentgemmaforcausallm method)": [[79, "tensorrt_llm.models.RecurrentGemmaForCausalLM.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.redrafterforcausallm method)": [[79, 
"tensorrt_llm.models.ReDrafterForCausalLM.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.sd3transformer2dmodel method)": [[79, "tensorrt_llm.models.SD3Transformer2DModel.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.whisperencoder method)": [[79, "tensorrt_llm.models.WhisperEncoder.prepare_inputs", false]], "prepare_position_ids_for_cogvlm() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.prepare_position_ids_for_cogvlm", false]], "prepare_recurrent_inputs() (tensorrt_llm.models.recurrentgemmaforcausallm method)": [[79, "tensorrt_llm.models.RecurrentGemmaForCausalLM.prepare_recurrent_inputs", false]], "preprocess() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.preprocess", false]], "presence_penalty (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.presence_penalty", false]], "presence_penalty (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.presence_penalty", false]], "pretrainedconfig (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.PretrainedConfig", false]], "pretrainedmodel (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.PretrainedModel", false]], "priority (tensorrt_llm.llmapi.kvcacheretentionconfig.tokenrangeretentionconfig property)": [[65, "tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.priority", false]], "process_input() (tensorrt_llm.runtime.encdecmodelrunner method)": [[82, "tensorrt_llm.runtime.EncDecModelRunner.process_input", false]], "process_logits_including_draft() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.process_logits_including_draft", false]], "prod() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.prod", false]], "profiler (tensorrt_llm.runtime.generationsession property)": [[82, 
"tensorrt_llm.runtime.GenerationSession.profiler", false]], "profiling_verbosity (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.profiling_verbosity", false]], "prompt (tensorrt_llm.llmapi.requestoutput attribute)": [[65, "tensorrt_llm.llmapi.RequestOutput.prompt", false]], "prompt (tensorrt_llm.llmapi.requestoutput property)": [[65, "id6", false]], "prompt_logprobs (tensorrt_llm.llmapi.completionoutput attribute)": [[65, "tensorrt_llm.llmapi.CompletionOutput.prompt_logprobs", false]], "prompt_logprobs (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.prompt_logprobs", false]], "prompt_token_ids (tensorrt_llm.llmapi.requestoutput attribute)": [[65, "tensorrt_llm.llmapi.RequestOutput.prompt_token_ids", false]], "prompttuningembedding (class in tensorrt_llm.layers.embedding)": [[78, "tensorrt_llm.layers.embedding.PromptTuningEmbedding", false]], "ptuning_setup() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup", false]], "ptuning_setup_fuyu() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup_fuyu", false]], "ptuning_setup_llava_next() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup_llava_next", false]], "ptuning_setup_phi3() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup_phi3", false]], "ptuning_setup_pixtral() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup_pixtral", false]], "python_e2e (tensorrt_llm.runtime.multimodalmodelrunner property)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.python_e2e", false]], "pytorch_eagle_weights_path (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[65, 
"tensorrt_llm.llmapi.EagleDecodingConfig.pytorch_eagle_weights_path", false]], "quant_algo (tensorrt_llm.llmapi.quantconfig attribute)": [[65, "tensorrt_llm.llmapi.QuantConfig.quant_algo", false]], "quant_algo (tensorrt_llm.models.pretrainedconfig property)": [[79, "tensorrt_llm.models.PretrainedConfig.quant_algo", false]], "quant_mode (tensorrt_llm.llmapi.quantconfig property)": [[65, "tensorrt_llm.llmapi.QuantConfig.quant_mode", false]], "quant_mode (tensorrt_llm.models.pretrainedconfig property)": [[79, "tensorrt_llm.models.PretrainedConfig.quant_mode", false]], "quant_mode (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.quant_mode", false]], "quant_mode (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.quant_mode", false]], "quantalgo (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.QuantAlgo", false]], "quantalgo (class in tensorrt_llm.quantization)": [[81, "tensorrt_llm.quantization.QuantAlgo", false]], "quantconfig (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.QuantConfig", false]], "quantize() (tensorrt_llm.models.baichuanforcausallm class method)": [[79, "tensorrt_llm.models.BaichuanForCausalLM.quantize", false]], "quantize() (tensorrt_llm.models.chatglmforcausallm class method)": [[79, "tensorrt_llm.models.ChatGLMForCausalLM.quantize", false]], "quantize() (tensorrt_llm.models.cogvlmforcausallm class method)": [[79, "tensorrt_llm.models.CogVLMForCausalLM.quantize", false]], "quantize() (tensorrt_llm.models.gemmaforcausallm class method)": [[79, "tensorrt_llm.models.GemmaForCausalLM.quantize", false]], "quantize() (tensorrt_llm.models.gptforcausallm class method)": [[79, "tensorrt_llm.models.GPTForCausalLM.quantize", false]], "quantize() (tensorrt_llm.models.llamaforcausallm class method)": [[79, "tensorrt_llm.models.LLaMAForCausalLM.quantize", false]], "quantize() (tensorrt_llm.models.pretrainedmodel class method)": [[79, 
"tensorrt_llm.models.PretrainedModel.quantize", false]], "quantize_and_export() (in module tensorrt_llm.quantization)": [[81, "tensorrt_llm.quantization.quantize_and_export", false]], "quantmode (class in tensorrt_llm.quantization)": [[81, "tensorrt_llm.quantization.QuantMode", false]], "quick_gelu() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.quick_gelu", false]], "qwenforcausallmgenerationsession (class in tensorrt_llm.runtime)": [[82, "tensorrt_llm.runtime.QWenForCausalLMGenerationSession", false]], "rand() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.rand", false]], "random_seed (tensorrt_llm.llmapi.calibconfig attribute)": [[65, "tensorrt_llm.llmapi.CalibConfig.random_seed", false]], "random_seed (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.random_seed", false]], "rank() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.rank", false]], "rearrange() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.rearrange", false]], "recurrentgemmaforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.RecurrentGemmaForCausalLM", false]], "recv() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.recv", false]], "redrafter_draft_len_per_beam (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.redrafter_draft_len_per_beam", false]], "redrafter_num_beams (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.redrafter_num_beams", false]], "redrafterforcausallm (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.ReDrafterForCausalLM", false]], "reduce() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.reduce", false]], "reduce_scatter() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.reduce_scatter", false]], "regex (tensorrt_llm.llmapi.guideddecodingparams attribute)": [[65, 
"tensorrt_llm.llmapi.GuidedDecodingParams.regex", false]], "relative (tensorrt_llm.functional.positionembeddingtype attribute)": [[77, "tensorrt_llm.functional.PositionEmbeddingType.relative", false]], "relaxed_delta (tensorrt_llm.llmapi.mtpdecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.MTPDecodingConfig.relaxed_delta", false]], "relaxed_topk (tensorrt_llm.llmapi.mtpdecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.MTPDecodingConfig.relaxed_topk", false]], "release() (tensorrt_llm.models.pretrainedmodel method)": [[79, "tensorrt_llm.models.PretrainedModel.release", false]], "relu() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.relu", false]], "remove_input_padding (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.remove_input_padding", false]], "remove_input_padding (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.remove_input_padding", false]], "remove_input_padding (tensorrt_llm.runtime.modelrunner property)": [[82, "tensorrt_llm.runtime.ModelRunner.remove_input_padding", false]], "remove_input_padding (tensorrt_llm.runtime.modelrunnercpp property)": [[82, "tensorrt_llm.runtime.ModelRunnerCpp.remove_input_padding", false]], "reorder_kv_cache_for_beam_search() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.reorder_kv_cache_for_beam_search", false]], "repeat() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.repeat", false]], "repeat() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.repeat", false]], "repeat_interleave() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.repeat_interleave", false]], "repetition_penalty (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.repetition_penalty", false]], "repetition_penalty (tensorrt_llm.runtime.samplingconfig attribute)": [[82, 
"tensorrt_llm.runtime.SamplingConfig.repetition_penalty", false]], "replace_all_uses_with() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.replace_all_uses_with", false]], "request_id (tensorrt_llm.llmapi.requestoutput attribute)": [[65, "tensorrt_llm.llmapi.RequestOutput.request_id", false]], "request_type (tensorrt_llm.llmapi.disaggregatedparams attribute)": [[65, "tensorrt_llm.llmapi.DisaggregatedParams.request_type", false]], "requesterror (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.RequestError", false]], "requestoutput (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.RequestOutput", false]], "residual_rms_norm (tensorrt_llm.functional.allreducefusionop attribute)": [[77, "tensorrt_llm.functional.AllReduceFusionOp.RESIDUAL_RMS_NORM", false]], "residual_rms_norm_out_quant_fp8 (tensorrt_llm.functional.allreducefusionop attribute)": [[77, "tensorrt_llm.functional.AllReduceFusionOp.RESIDUAL_RMS_NORM_OUT_QUANT_FP8", false]], "residual_rms_norm_out_quant_nvfp4 (tensorrt_llm.functional.allreducefusionop attribute)": [[77, "tensorrt_llm.functional.AllReduceFusionOp.RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4", false]], "residual_rms_norm_quant_fp8 (tensorrt_llm.functional.allreducefusionop attribute)": [[77, "tensorrt_llm.functional.AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_FP8", false]], "residual_rms_norm_quant_nvfp4 (tensorrt_llm.functional.allreducefusionop attribute)": [[77, "tensorrt_llm.functional.AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4", false]], "residual_rms_prepost_norm (tensorrt_llm.functional.allreducefusionop attribute)": [[77, "tensorrt_llm.functional.AllReduceFusionOp.RESIDUAL_RMS_PREPOST_NORM", false]], "return_context_logits (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.return_context_logits", false]], "return_dict (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.return_dict", false]], 
"return_encoder_output (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.return_encoder_output", false]], "return_generation_logits (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.return_generation_logits", false]], "return_perf_metrics (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.return_perf_metrics", false]], "rg_lru() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.rg_lru", false]], "rms_norm() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.rms_norm", false]], "rmsnorm (class in tensorrt_llm.layers.normalization)": [[78, "tensorrt_llm.layers.normalization.RmsNorm", false]], "rmsnorm (tensorrt_llm.functional.layernormtype attribute)": [[77, "tensorrt_llm.functional.LayerNormType.RmsNorm", false]], "rnn_conv_dim_size (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.rnn_conv_dim_size", false]], "rnn_conv_dim_size (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.rnn_conv_dim_size", false]], "rnn_head_size (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.rnn_head_size", false]], "rnn_head_size (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.rnn_head_size", false]], "rnn_hidden_size (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.rnn_hidden_size", false]], "rnn_hidden_size (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.rnn_hidden_size", false]], "robertaforquestionanswering (in module tensorrt_llm.models)": [[79, "tensorrt_llm.models.RobertaForQuestionAnswering", false]], "robertaforsequenceclassification (in module tensorrt_llm.models)": [[79, "tensorrt_llm.models.RobertaForSequenceClassification", false]], "robertamodel (in module 
tensorrt_llm.models)": [[79, "tensorrt_llm.models.RobertaModel", false]], "rope_gpt_neox (tensorrt_llm.functional.positionembeddingtype attribute)": [[77, "tensorrt_llm.functional.PositionEmbeddingType.rope_gpt_neox", false]], "rope_gptj (tensorrt_llm.functional.positionembeddingtype attribute)": [[77, "tensorrt_llm.functional.PositionEmbeddingType.rope_gptj", false]], "ropeembeddingutils (class in tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.RopeEmbeddingUtils", false]], "rotaryscalingtype (class in tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.RotaryScalingType", false]], "rotate_every_two() (tensorrt_llm.functional.ropeembeddingutils static method)": [[77, "tensorrt_llm.functional.RopeEmbeddingUtils.rotate_every_two", false]], "rotate_half() (tensorrt_llm.functional.ropeembeddingutils static method)": [[77, "tensorrt_llm.functional.RopeEmbeddingUtils.rotate_half", false]], "round() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.round", false]], "rowlinear (class in tensorrt_llm.layers.linear)": [[78, "tensorrt_llm.layers.linear.RowLinear", false]], "run() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.run", false]], "run() (tensorrt_llm.runtime.session method)": [[82, "tensorrt_llm.runtime.Session.run", false]], "runtime (tensorrt_llm.runtime.generationsession attribute)": [[82, "tensorrt_llm.runtime.GenerationSession.runtime", false]], "runtime (tensorrt_llm.runtime.session property)": [[82, "tensorrt_llm.runtime.Session.runtime", false]], "samplingconfig (class in tensorrt_llm.runtime)": [[82, "tensorrt_llm.runtime.SamplingConfig", false]], "samplingparams (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.SamplingParams", false]], "save() (tensorrt_llm.llmapi.llm method)": [[65, "tensorrt_llm.llmapi.LLM.save", false]], "save_checkpoint() (tensorrt_llm.models.llavanextvisionwrapper method)": [[79, 
"tensorrt_llm.models.LlavaNextVisionWrapper.save_checkpoint", false]], "save_checkpoint() (tensorrt_llm.models.pretrainedmodel method)": [[79, "tensorrt_llm.models.PretrainedModel.save_checkpoint", false]], "scatter() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.scatter", false]], "scatter_nd() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.scatter_nd", false]], "schedulerconfig (class in tensorrt_llm.llmapi)": [[65, "tensorrt_llm.llmapi.SchedulerConfig", false]], "sd35adalayernormzerox (class in tensorrt_llm.layers.normalization)": [[78, "tensorrt_llm.layers.normalization.SD35AdaLayerNormZeroX", false]], "sd3patchembed (class in tensorrt_llm.layers.embedding)": [[78, "tensorrt_llm.layers.embedding.SD3PatchEmbed", false]], "sd3transformer2dmodel (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.SD3Transformer2DModel", false]], "secondary_offload_min_priority (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[65, "tensorrt_llm.llmapi.KvCacheConfig.secondary_offload_min_priority", false]], "seed (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.seed", false]], "select() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.select", false]], "select() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.select", false]], "selective_scan() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.selective_scan", false]], "send() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.send", false]], "serialize_engine() (tensorrt_llm.runtime.modelrunner method)": [[82, "tensorrt_llm.runtime.ModelRunner.serialize_engine", false]], "session (class in tensorrt_llm.runtime)": [[82, "tensorrt_llm.runtime.Session", false]], "set_attn_processor() (tensorrt_llm.models.sd3transformer2dmodel method)": [[79, "tensorrt_llm.models.SD3Transformer2DModel.set_attn_processor", false]], "set_from_optional (c macro)": [[1, 
"c.SET_FROM_OPTIONAL", false]], "set_if_not_exist() (tensorrt_llm.models.pretrainedconfig method)": [[79, "tensorrt_llm.models.PretrainedConfig.set_if_not_exist", false]], "set_rank() (tensorrt_llm.models.pretrainedconfig method)": [[79, "tensorrt_llm.models.PretrainedConfig.set_rank", false]], "set_rel_attn_table() (tensorrt_llm.layers.attention.attention method)": [[78, "tensorrt_llm.layers.attention.Attention.set_rel_attn_table", false]], "set_shapes() (tensorrt_llm.runtime.session method)": [[82, "tensorrt_llm.runtime.Session.set_shapes", false]], "setup() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.setup", false]], "setup_fake_prompts() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.setup_fake_prompts", false]], "setup_fake_prompts_qwen2vl() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.setup_fake_prompts_qwen2vl", false]], "setup_fake_prompts_vila() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.setup_fake_prompts_vila", false]], "setup_inputs() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.setup_inputs", false]], "shape (tensorrt_llm.functional.tensor property)": [[77, "tensorrt_llm.functional.Tensor.shape", false]], "shape (tensorrt_llm.runtime.tensorinfo attribute)": [[82, "tensorrt_llm.runtime.TensorInfo.shape", false]], "shape() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.shape", false]], "shutdown() (tensorrt_llm.llmapi.llm method)": [[65, "tensorrt_llm.llmapi.LLM.shutdown", false]], "shutdown() (tensorrt_llm.llmapi.mpicommsession method)": [[65, "tensorrt_llm.llmapi.MpiCommSession.shutdown", false]], "sidestreamidtype (class in tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.SideStreamIDType", false]], "sigmoid() (in module 
tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.sigmoid", false]], "silu() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.silu", false]], "sin() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.sin", false]], "sink_token_length (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[65, "tensorrt_llm.llmapi.KvCacheConfig.sink_token_length", false]], "sink_token_length (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.sink_token_length", false]], "size (tensorrt_llm.functional.sliceinputtype attribute)": [[77, "tensorrt_llm.functional.SliceInputType.size", false]], "size() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.size", false]], "skip_cross_attn_blocks (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.skip_cross_attn_blocks", false]], "skip_cross_kv (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.skip_cross_kv", false]], "skip_special_tokens (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.skip_special_tokens", false]], "slice() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.slice", false]], "sliceinputtype (class in tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.SliceInputType", false]], "sliding_window_causal (tensorrt_llm.functional.attentionmasktype attribute)": [[77, "tensorrt_llm.functional.AttentionMaskType.sliding_window_causal", false]], "smoothquant_val (tensorrt_llm.llmapi.quantconfig attribute)": [[65, "tensorrt_llm.llmapi.QuantConfig.smoothquant_val", false]], "softmax() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.softmax", false]], "softplus() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.softplus", false]], "spaces_between_special_tokens (tensorrt_llm.llmapi.samplingparams attribute)": [[65, 
"tensorrt_llm.llmapi.SamplingParams.spaces_between_special_tokens", false]], "specdecodingparams (class in tensorrt_llm.layers.attention)": [[78, "tensorrt_llm.layers.attention.SpecDecodingParams", false]], "speculative_decoding_mode (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.speculative_decoding_mode", false]], "speculativedecodingmode (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.SpeculativeDecodingMode", false]], "split() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.split", false]], "split() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.split", false]], "split_prompt_by_images() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.split_prompt_by_images", false]], "sqrt() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.sqrt", false]], "sqrt() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.sqrt", false]], "squared_relu() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.squared_relu", false]], "squeeze() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.squeeze", false]], "squeeze() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.squeeze", false]], "squeeze() (tensorrt_llm.runtime.tensorinfo method)": [[82, "tensorrt_llm.runtime.TensorInfo.squeeze", false]], "stack() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.stack", false]], "start (tensorrt_llm.functional.sliceinputtype attribute)": [[77, "tensorrt_llm.functional.SliceInputType.start", false]], "state_dtype (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.state_dtype", false]], "state_dtype (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.state_dtype", false]], "state_size (tensorrt_llm.runtime.generationsession 
property)": [[82, "tensorrt_llm.runtime.GenerationSession.state_size", false]], "state_size (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.state_size", false]], "static (tensorrt_llm.llmapi.batchingtype attribute)": [[65, "tensorrt_llm.llmapi.BatchingType.STATIC", false]], "static_batch (tensorrt_llm.llmapi.capacityschedulerpolicy attribute)": [[65, "tensorrt_llm.llmapi.CapacitySchedulerPolicy.STATIC_BATCH", false]], "step() (tensorrt_llm.runtime.kvcachemanager method)": [[82, "tensorrt_llm.runtime.KVCacheManager.step", false]], "stop (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.stop", false]], "stop_reason (tensorrt_llm.llmapi.completionoutput attribute)": [[65, "tensorrt_llm.llmapi.CompletionOutput.stop_reason", false]], "stop_token_ids (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.stop_token_ids", false]], "stop_words_list (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.stop_words_list", false]], "stoppingcriteria (class in tensorrt_llm.runtime)": [[82, "tensorrt_llm.runtime.StoppingCriteria", false]], "stoppingcriterialist (class in tensorrt_llm.runtime)": [[82, "tensorrt_llm.runtime.StoppingCriteriaList", false]], "stride (tensorrt_llm.functional.sliceinputtype attribute)": [[77, "tensorrt_llm.functional.SliceInputType.stride", false]], "strongly_typed (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.strongly_typed", false]], "structural_tag (tensorrt_llm.llmapi.guideddecodingparams attribute)": [[65, "tensorrt_llm.llmapi.GuidedDecodingParams.structural_tag", false]], "sub() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.sub", false]], "submit() (tensorrt_llm.llmapi.mpicommsession method)": [[65, "tensorrt_llm.llmapi.MpiCommSession.submit", false]], "submit_sync() (tensorrt_llm.llmapi.mpicommsession method)": [[65, 
"tensorrt_llm.llmapi.MpiCommSession.submit_sync", false]], "sum() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.sum", false]], "swiglu() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.swiglu", false]], "tanh() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.tanh", false]], "temperature (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.temperature", false]], "temperature (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.temperature", false]], "tensor (class in tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.Tensor", false]], "tensorinfo (class in tensorrt_llm.runtime)": [[82, "tensorrt_llm.runtime.TensorInfo", false]], "tensorrt_llm": [[77, "module-tensorrt_llm", false], [78, "module-tensorrt_llm", false], [79, "module-tensorrt_llm", false], [80, "module-tensorrt_llm", false], [81, "module-tensorrt_llm", false], [82, "module-tensorrt_llm", false]], "tensorrt_llm (c++ type)": [[0, "_CPPv412tensorrt_llm", false], [1, "_CPPv412tensorrt_llm", false]], "tensorrt_llm.functional": [[77, "module-tensorrt_llm.functional", false]], "tensorrt_llm.layers.activation": [[78, "module-tensorrt_llm.layers.activation", false]], "tensorrt_llm.layers.attention": [[78, "module-tensorrt_llm.layers.attention", false]], "tensorrt_llm.layers.cast": [[78, "module-tensorrt_llm.layers.cast", false]], "tensorrt_llm.layers.conv": [[78, "module-tensorrt_llm.layers.conv", false]], "tensorrt_llm.layers.embedding": [[78, "module-tensorrt_llm.layers.embedding", false]], "tensorrt_llm.layers.linear": [[78, "module-tensorrt_llm.layers.linear", false]], "tensorrt_llm.layers.mlp": [[78, "module-tensorrt_llm.layers.mlp", false]], "tensorrt_llm.layers.normalization": [[78, "module-tensorrt_llm.layers.normalization", false]], "tensorrt_llm.layers.pooling": [[78, "module-tensorrt_llm.layers.pooling", false]], "tensorrt_llm.models": [[79, 
"module-tensorrt_llm.models", false]], "tensorrt_llm.plugin": [[80, "module-tensorrt_llm.plugin", false]], "tensorrt_llm.quantization": [[81, "module-tensorrt_llm.quantization", false]], "tensorrt_llm.runtime": [[82, "module-tensorrt_llm.runtime", false]], "tensorrt_llm::batch_manager (c++ type)": [[0, "_CPPv4N12tensorrt_llm13batch_managerE", false], [1, "_CPPv4N12tensorrt_llm13batch_managerE", false]], "tensorrt_llm::batch_manager::kv_cache_manager (c++ type)": [[0, "_CPPv4N12tensorrt_llm13batch_manager16kv_cache_managerE", false]], "tensorrt_llm::executor (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executorE", false]], "tensorrt_llm::executor::additionalmodeloutput (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutputE", false]], "tensorrt_llm::executor::additionalmodeloutput::additionalmodeloutput (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutput21AdditionalModelOutputENSt6stringEb", false]], "tensorrt_llm::executor::additionalmodeloutput::gathercontext (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutput13gatherContextE", false]], "tensorrt_llm::executor::additionalmodeloutput::name (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutput4nameE", false]], "tensorrt_llm::executor::additionalmodeloutput::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor21AdditionalModelOutputeqERK21AdditionalModelOutput", false]], "tensorrt_llm::executor::additionaloutput (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputE", false]], "tensorrt_llm::executor::additionaloutput::additionaloutput (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputENSt6stringE6Tensor", false], [0, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputERK16AdditionalOutput", false], [0, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputERR16AdditionalOutput", false]], "tensorrt_llm::executor::additionaloutput::name 
(c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput4nameE", false]], "tensorrt_llm::executor::additionaloutput::operator= (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputaSERK16AdditionalOutput", false], [0, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputaSERR16AdditionalOutput", false]], "tensorrt_llm::executor::additionaloutput::output (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput6outputE", false]], "tensorrt_llm::executor::additionaloutput::~additionaloutput (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputD0Ev", false]], "tensorrt_llm::executor::batchingtype (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor12BatchingTypeE", false]], "tensorrt_llm::executor::batchingtype::kinflight (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12BatchingType9kINFLIGHTE", false]], "tensorrt_llm::executor::batchingtype::kstatic (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12BatchingType7kSTATICE", false]], "tensorrt_llm::executor::beamtokens (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor10BeamTokensE", false]], "tensorrt_llm::executor::bufferview (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor10BufferViewE", false]], "tensorrt_llm::executor::cachetransceiverconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfigE", false]], "tensorrt_llm::executor::cachetransceiverconfig::cachetransceiverconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfig22CacheTransceiverConfigENSt8optionalI6size_tEE", false]], "tensorrt_llm::executor::cachetransceiverconfig::getmaxnumtokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor22CacheTransceiverConfig15getMaxNumTokensEv", false]], "tensorrt_llm::executor::cachetransceiverconfig::mmaxnumtokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfig13mMaxNumTokensE", false]], "tensorrt_llm::executor::cachetransceiverconfig::operator== (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor22CacheTransceiverConfigeqERK22CacheTransceiverConfig", false]], "tensorrt_llm::executor::cachetransceiverconfig::setmaxnumtokens (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfig15setMaxNumTokensE6size_t", false]], "tensorrt_llm::executor::capacityschedulerpolicy (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor23CapacitySchedulerPolicyE", false]], "tensorrt_llm::executor::capacityschedulerpolicy::kguaranteed_no_evict (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor23CapacitySchedulerPolicy20kGUARANTEED_NO_EVICTE", false]], "tensorrt_llm::executor::capacityschedulerpolicy::kmax_utilization (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor23CapacitySchedulerPolicy16kMAX_UTILIZATIONE", false]], "tensorrt_llm::executor::capacityschedulerpolicy::kstatic_batch (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor23CapacitySchedulerPolicy13kSTATIC_BATCHE", false]], "tensorrt_llm::executor::communicationmode (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor17CommunicationModeE", false]], "tensorrt_llm::executor::communicationmode::kleader (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor17CommunicationMode7kLEADERE", false]], "tensorrt_llm::executor::communicationmode::korchestrator (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor17CommunicationMode13kORCHESTRATORE", false]], "tensorrt_llm::executor::communicationtype (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor17CommunicationTypeE", false]], "tensorrt_llm::executor::communicationtype::kmpi (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor17CommunicationType4kMPIE", false]], "tensorrt_llm::executor::contextchunkingpolicy (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor21ContextChunkingPolicyE", false]], "tensorrt_llm::executor::contextchunkingpolicy::kequal_progress (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor21ContextChunkingPolicy15kEQUAL_PROGRESSE", false]], 
"tensorrt_llm::executor::contextchunkingpolicy::kfirst_come_first_served (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor21ContextChunkingPolicy24kFIRST_COME_FIRST_SERVEDE", false]], "tensorrt_llm::executor::contextphaseparams (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParamsE", false]], "tensorrt_llm::executor::contextphaseparams::contextphaseparams (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeNSt8optionalI9VecTokensEE", false], [0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypePvNSt8optionalI9VecTokensEE", false], [0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeRKNSt6vectorIcEENSt8optionalI9VecTokensEE", false], [0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsERK18ContextPhaseParams", false], [0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsERR18ContextPhaseParams", false]], "tensorrt_llm::executor::contextphaseparams::deleter (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams7deleterEPKv", false]], "tensorrt_llm::executor::contextphaseparams::getdrafttokens (c++ function)": [[0, "_CPPv4NKR12tensorrt_llm8executor18ContextPhaseParams14getDraftTokensEv", false]], "tensorrt_llm::executor::contextphaseparams::getfirstgentokens (c++ function)": [[0, "_CPPv4NKR12tensorrt_llm8executor18ContextPhaseParams17getFirstGenTokensEv", false]], "tensorrt_llm::executor::contextphaseparams::getreqid (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18ContextPhaseParams8getReqIdEv", false]], "tensorrt_llm::executor::contextphaseparams::getserializedstate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18ContextPhaseParams18getSerializedStateEv", false]], "tensorrt_llm::executor::contextphaseparams::getstate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams8getStateEv", 
false], [0, "_CPPv4NK12tensorrt_llm8executor18ContextPhaseParams8getStateEv", false]], "tensorrt_llm::executor::contextphaseparams::mdrafttokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams12mDraftTokensE", false]], "tensorrt_llm::executor::contextphaseparams::mfirstgentokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams15mFirstGenTokensE", false]], "tensorrt_llm::executor::contextphaseparams::mreqid (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams6mReqIdE", false]], "tensorrt_llm::executor::contextphaseparams::mstate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams6mStateE", false]], "tensorrt_llm::executor::contextphaseparams::operator= (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParamsaSERK18ContextPhaseParams", false], [0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParamsaSERR18ContextPhaseParams", false]], "tensorrt_llm::executor::contextphaseparams::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18ContextPhaseParamseqERK18ContextPhaseParams", false]], "tensorrt_llm::executor::contextphaseparams::popfirstgentokens (c++ function)": [[0, "_CPPv4NO12tensorrt_llm8executor18ContextPhaseParams17popFirstGenTokensEv", false]], "tensorrt_llm::executor::contextphaseparams::releasestate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams12releaseStateEv", false]], "tensorrt_llm::executor::contextphaseparams::requestidtype (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams13RequestIdTypeE", false]], "tensorrt_llm::executor::contextphaseparams::stateptr (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams8StatePtrE", false]], "tensorrt_llm::executor::contextphaseparams::~contextphaseparams (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParamsD0Ev", false]], "tensorrt_llm::executor::datatransceiverstate (c++ class)": [[0, 
"_CPPv4N12tensorrt_llm8executor20DataTransceiverStateE", false]], "tensorrt_llm::executor::datatransceiverstate::datatransceiverstate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState20DataTransceiverStateEN8kv_cache10CacheStateEN8kv_cache9CommStateE", false], [0, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState20DataTransceiverStateEv", false]], "tensorrt_llm::executor::datatransceiverstate::getcachestate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20DataTransceiverState13getCacheStateEv", false]], "tensorrt_llm::executor::datatransceiverstate::getcommstate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20DataTransceiverState12getCommStateEv", false]], "tensorrt_llm::executor::datatransceiverstate::mcachestate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState11mCacheStateE", false]], "tensorrt_llm::executor::datatransceiverstate::mcommstate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState10mCommStateE", false]], "tensorrt_llm::executor::datatransceiverstate::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20DataTransceiverStateeqERK20DataTransceiverState", false]], "tensorrt_llm::executor::datatransceiverstate::setcachestate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState13setCacheStateEN8kv_cache10CacheStateE", false]], "tensorrt_llm::executor::datatransceiverstate::setcommstate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState12setCommStateEN8kv_cache9CommStateE", false]], "tensorrt_llm::executor::datatransceiverstate::tostring (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20DataTransceiverState8toStringEv", false]], "tensorrt_llm::executor::datatype (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor8DataTypeE", false]], "tensorrt_llm::executor::datatype::kbf16 (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType5kBF16E", false]], "tensorrt_llm::executor::datatype::kbool (c++ 
enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType5kBOOLE", false]], "tensorrt_llm::executor::datatype::kfp16 (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType5kFP16E", false]], "tensorrt_llm::executor::datatype::kfp32 (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType5kFP32E", false]], "tensorrt_llm::executor::datatype::kfp8 (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType4kFP8E", false]], "tensorrt_llm::executor::datatype::kint32 (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType6kINT32E", false]], "tensorrt_llm::executor::datatype::kint64 (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType6kINT64E", false]], "tensorrt_llm::executor::datatype::kint8 (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType5kINT8E", false]], "tensorrt_llm::executor::datatype::kuint8 (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType6kUINT8E", false]], "tensorrt_llm::executor::datatype::kunknown (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType8kUNKNOWNE", false]], "tensorrt_llm::executor::debugconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfigE", false]], "tensorrt_llm::executor::debugconfig::debugconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig11DebugConfigEbb9StringVec10SizeType32", false]], "tensorrt_llm::executor::debugconfig::getdebuginputtensors (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11DebugConfig20getDebugInputTensorsEv", false]], "tensorrt_llm::executor::debugconfig::getdebugoutputtensors (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11DebugConfig21getDebugOutputTensorsEv", false]], "tensorrt_llm::executor::debugconfig::getdebugtensornames (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11DebugConfig19getDebugTensorNamesEv", false]], "tensorrt_llm::executor::debugconfig::getdebugtensorsmaxiterations (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor11DebugConfig28getDebugTensorsMaxIterationsEv", false]], "tensorrt_llm::executor::debugconfig::mdebuginputtensors (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig18mDebugInputTensorsE", false]], "tensorrt_llm::executor::debugconfig::mdebugoutputtensors (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig19mDebugOutputTensorsE", false]], "tensorrt_llm::executor::debugconfig::mdebugtensornames (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig17mDebugTensorNamesE", false]], "tensorrt_llm::executor::debugconfig::mdebugtensorsmaxiterations (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig26mDebugTensorsMaxIterationsE", false]], "tensorrt_llm::executor::debugconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11DebugConfigeqERK11DebugConfig", false]], "tensorrt_llm::executor::debugconfig::setdebuginputtensors (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig20setDebugInputTensorsEb", false]], "tensorrt_llm::executor::debugconfig::setdebugoutputtensors (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig21setDebugOutputTensorsEb", false]], "tensorrt_llm::executor::debugconfig::setdebugtensornames (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig19setDebugTensorNamesERK9StringVec", false]], "tensorrt_llm::executor::debugconfig::setdebugtensorsmaxiterations (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig28setDebugTensorsMaxIterationsE10SizeType32", false]], "tensorrt_llm::executor::debugconfig::stringvec (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig9StringVecE", false]], "tensorrt_llm::executor::debugtensorsperiteration (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor24DebugTensorsPerIterationE", false]], "tensorrt_llm::executor::debugtensorsperiteration::debugtensors (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor24DebugTensorsPerIteration12debugTensorsE", false]], 
"tensorrt_llm::executor::debugtensorsperiteration::iter (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor24DebugTensorsPerIteration4iterE", false]], "tensorrt_llm::executor::decodingconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfigE", false]], "tensorrt_llm::executor::decodingconfig::decodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14DecodingConfigENSt8optionalI12DecodingModeEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI13MedusaChoicesEENSt8optionalI11EagleConfigEE", false]], "tensorrt_llm::executor::decodingconfig::enableseamlesslookaheaddecoding (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig31enableSeamlessLookaheadDecodingEv", false]], "tensorrt_llm::executor::decodingconfig::getdecodingmode (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig15getDecodingModeEv", false]], "tensorrt_llm::executor::decodingconfig::geteagleconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig14getEagleConfigEv", false]], "tensorrt_llm::executor::decodingconfig::getlookaheaddecodingconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig26getLookaheadDecodingConfigEv", false]], "tensorrt_llm::executor::decodingconfig::getlookaheaddecodingmaxnumrequest (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig33getLookaheadDecodingMaxNumRequestEv", false]], "tensorrt_llm::executor::decodingconfig::getmedusachoices (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig16getMedusaChoicesEv", false]], "tensorrt_llm::executor::decodingconfig::mdecodingmode (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig13mDecodingModeE", false]], "tensorrt_llm::executor::decodingconfig::meagleconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig12mEagleConfigE", false]], "tensorrt_llm::executor::decodingconfig::mlookaheaddecodingconfig (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor14DecodingConfig24mLookaheadDecodingConfigE", false]], "tensorrt_llm::executor::decodingconfig::mlookaheaddecodingmaxnumrequest (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig31mLookaheadDecodingMaxNumRequestE", false]], "tensorrt_llm::executor::decodingconfig::mmedusachoices (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14mMedusaChoicesE", false]], "tensorrt_llm::executor::decodingconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14DecodingConfigeqERK14DecodingConfig", false]], "tensorrt_llm::executor::decodingconfig::setdecodingmode (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig15setDecodingModeERK12DecodingMode", false]], "tensorrt_llm::executor::decodingconfig::seteagleconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14setEagleConfigERK11EagleConfig", false]], "tensorrt_llm::executor::decodingconfig::setlookaheaddecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig26setLookaheadDecodingConfigERK23LookaheadDecodingConfig", false]], "tensorrt_llm::executor::decodingconfig::setmedusachoices (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig16setMedusaChoicesERK13MedusaChoices", false]], "tensorrt_llm::executor::decodingmode (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingModeE", false]], "tensorrt_llm::executor::decodingmode::allbitset (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode9allBitSetE14UnderlyingType", false]], "tensorrt_llm::executor::decodingmode::anybitset (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode9anyBitSetE14UnderlyingType", false]], "tensorrt_llm::executor::decodingmode::auto (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode4AutoEv", false]], "tensorrt_llm::executor::decodingmode::beamsearch (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode10BeamSearchEv", false]], 
"tensorrt_llm::executor::decodingmode::decodingmode (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode12DecodingModeE14UnderlyingType", false]], "tensorrt_llm::executor::decodingmode::eagle (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode5EagleEv", false]], "tensorrt_llm::executor::decodingmode::explicitdrafttokens (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode19ExplicitDraftTokensEv", false]], "tensorrt_llm::executor::decodingmode::externaldrafttokens (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode19ExternalDraftTokensEv", false]], "tensorrt_llm::executor::decodingmode::getname (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode7getNameEv", false]], "tensorrt_llm::executor::decodingmode::getstate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode8getStateEv", false]], "tensorrt_llm::executor::decodingmode::isauto (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode6isAutoEv", false]], "tensorrt_llm::executor::decodingmode::isbeamsearch (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode12isBeamSearchEv", false]], "tensorrt_llm::executor::decodingmode::iseagle (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode7isEagleEv", false]], "tensorrt_llm::executor::decodingmode::isexplicitdrafttokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode21isExplicitDraftTokensEv", false]], "tensorrt_llm::executor::decodingmode::isexternaldrafttokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode21isExternalDraftTokensEv", false]], "tensorrt_llm::executor::decodingmode::islookahead (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode11isLookaheadEv", false]], "tensorrt_llm::executor::decodingmode::ismedusa (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode8isMedusaEv", false]], "tensorrt_llm::executor::decodingmode::istopk (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor12DecodingMode6isTopKEv", false]], "tensorrt_llm::executor::decodingmode::istopkandtopp (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode13isTopKandTopPEv", false]], "tensorrt_llm::executor::decodingmode::istopkortopp (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode12isTopKorTopPEv", false]], "tensorrt_llm::executor::decodingmode::istopp (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode6isTopPEv", false]], "tensorrt_llm::executor::decodingmode::isusebantokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode14isUseBanTokensEv", false]], "tensorrt_llm::executor::decodingmode::isusebanwords (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode13isUseBanWordsEv", false]], "tensorrt_llm::executor::decodingmode::isuseexpliciteosstop (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode20isUseExplicitEosStopEv", false]], "tensorrt_llm::executor::decodingmode::isusefrequencypenalty (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode21isUseFrequencyPenaltyEv", false]], "tensorrt_llm::executor::decodingmode::isusemaxlengthstop (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode18isUseMaxLengthStopEv", false]], "tensorrt_llm::executor::decodingmode::isuseminlength (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode14isUseMinLengthEv", false]], "tensorrt_llm::executor::decodingmode::isuseminp (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode9isUseMinPEv", false]], "tensorrt_llm::executor::decodingmode::isusenorepeatngramsize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode22isUseNoRepeatNgramSizeEv", false]], "tensorrt_llm::executor::decodingmode::isuseoccurrencepenalty (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode22isUseOccurrencePenaltyEv", false]], "tensorrt_llm::executor::decodingmode::isusepenalty (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor12DecodingMode12isUsePenaltyEv", false]], "tensorrt_llm::executor::decodingmode::isusepresencepenalty (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode20isUsePresencePenaltyEv", false]], "tensorrt_llm::executor::decodingmode::isuserepetitionpenalty (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode22isUseRepetitionPenaltyEv", false]], "tensorrt_llm::executor::decodingmode::isusestopcriteria (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode17isUseStopCriteriaEv", false]], "tensorrt_llm::executor::decodingmode::isusestopwords (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode14isUseStopWordsEv", false]], "tensorrt_llm::executor::decodingmode::isusetemperature (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode16isUseTemperatureEv", false]], "tensorrt_llm::executor::decodingmode::isusevariablebeamwidthsearch (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode28isUseVariableBeamWidthSearchEv", false]], "tensorrt_llm::executor::decodingmode::kauto (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode5kAutoE", false]], "tensorrt_llm::executor::decodingmode::kbeamsearch (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode11kBeamSearchE", false]], "tensorrt_llm::executor::decodingmode::keagle (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode6kEagleE", false]], "tensorrt_llm::executor::decodingmode::kexplicitdrafttokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode20kExplicitDraftTokensE", false]], "tensorrt_llm::executor::decodingmode::kexternaldrafttokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode20kExternalDraftTokensE", false]], "tensorrt_llm::executor::decodingmode::klookahead (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode10kLookaheadE", false]], "tensorrt_llm::executor::decodingmode::kmedusa (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor12DecodingMode7kMedusaE", false]], "tensorrt_llm::executor::decodingmode::knumflags (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode9kNumFlagsE", false]], "tensorrt_llm::executor::decodingmode::ktopk (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode5kTopKE", false]], "tensorrt_llm::executor::decodingmode::ktopktopp (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode9kTopKTopPE", false]], "tensorrt_llm::executor::decodingmode::ktopp (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode5kTopPE", false]], "tensorrt_llm::executor::decodingmode::kusebantokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode13kUseBanTokensE", false]], "tensorrt_llm::executor::decodingmode::kusebanwords (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode12kUseBanWordsE", false]], "tensorrt_llm::executor::decodingmode::kuseexpliciteosstop (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode19kUseExplicitEosStopE", false]], "tensorrt_llm::executor::decodingmode::kusefrequencypenalties (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode22kUseFrequencyPenaltiesE", false]], "tensorrt_llm::executor::decodingmode::kusemaxlengthstop (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode17kUseMaxLengthStopE", false]], "tensorrt_llm::executor::decodingmode::kuseminlength (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode13kUseMinLengthE", false]], "tensorrt_llm::executor::decodingmode::kuseminp (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode8kUseMinPE", false]], "tensorrt_llm::executor::decodingmode::kusenorepeatngramsize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode21kUseNoRepeatNgramSizeE", false]], "tensorrt_llm::executor::decodingmode::kuseoccurrencepenalties (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode23kUseOccurrencePenaltiesE", false]], 
"tensorrt_llm::executor::decodingmode::kusepenalties (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode13kUsePenaltiesE", false]], "tensorrt_llm::executor::decodingmode::kusepresencepenalties (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode21kUsePresencePenaltiesE", false]], "tensorrt_llm::executor::decodingmode::kuserepetitionpenalties (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode23kUseRepetitionPenaltiesE", false]], "tensorrt_llm::executor::decodingmode::kusestandardstopcriteria (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode24kUseStandardStopCriteriaE", false]], "tensorrt_llm::executor::decodingmode::kusestopwords (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode13kUseStopWordsE", false]], "tensorrt_llm::executor::decodingmode::kusetemperature (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode15kUseTemperatureE", false]], "tensorrt_llm::executor::decodingmode::kusevariablebeamwidthsearch (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode27kUseVariableBeamWidthSearchE", false]], "tensorrt_llm::executor::decodingmode::lookahead (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode9LookaheadEv", false]], "tensorrt_llm::executor::decodingmode::medusa (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode6MedusaEv", false]], "tensorrt_llm::executor::decodingmode::mstate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode6mStateE", false]], "tensorrt_llm::executor::decodingmode::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingModeeqERK12DecodingMode", false]], "tensorrt_llm::executor::decodingmode::setbitto (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode8setBitToE14UnderlyingTypeb", false]], "tensorrt_llm::executor::decodingmode::topk (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode4TopKEv", false]], "tensorrt_llm::executor::decodingmode::topktopp 
(c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode8TopKTopPEv", false]], "tensorrt_llm::executor::decodingmode::topp (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode4TopPEv", false]], "tensorrt_llm::executor::decodingmode::underlyingtype (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode14UnderlyingTypeE", false]], "tensorrt_llm::executor::decodingmode::usebantokens (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode12useBanTokensEb", false]], "tensorrt_llm::executor::decodingmode::usebanwords (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode11useBanWordsEb", false]], "tensorrt_llm::executor::decodingmode::useexpliciteosstop (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode18useExplicitEosStopEb", false]], "tensorrt_llm::executor::decodingmode::usefrequencypenalty (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode19useFrequencyPenaltyEb", false]], "tensorrt_llm::executor::decodingmode::usemaxlengthstop (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode16useMaxLengthStopEb", false]], "tensorrt_llm::executor::decodingmode::useminlength (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode12useMinLengthEb", false]], "tensorrt_llm::executor::decodingmode::useminp (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode7useMinPEb", false]], "tensorrt_llm::executor::decodingmode::usenorepeatngramsize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode20useNoRepeatNgramSizeEb", false]], "tensorrt_llm::executor::decodingmode::useoccurrencepenalties (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode22useOccurrencePenaltiesEb", false]], "tensorrt_llm::executor::decodingmode::usepresencepenalty (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode18usePresencePenaltyEb", false]], "tensorrt_llm::executor::decodingmode::userepetitionpenalty (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor12DecodingMode20useRepetitionPenaltyEb", false]], "tensorrt_llm::executor::decodingmode::usestopwords (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode12useStopWordsEb", false]], "tensorrt_llm::executor::decodingmode::usetemperature (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode14useTemperatureEb", false]], "tensorrt_llm::executor::decodingmode::usevariablebeamwidthsearch (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode26useVariableBeamWidthSearchEb", false]], "tensorrt_llm::executor::detail (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor6detailE", false]], "tensorrt_llm::executor::detail::dimtype64 (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor6detail9DimType64E", false]], "tensorrt_llm::executor::detail::ofitensor (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor6detail9ofITensorENSt10shared_ptrIN7runtime7ITensorEEE", false]], "tensorrt_llm::executor::detail::toitensor (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor6detail9toITensorERK6Tensor", false]], "tensorrt_llm::executor::disagg_executor (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executorE", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestratorE", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::awaitcontextresponses (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator21awaitContextResponsesERKNSt8optionalINSt6chrono12millisecondsEEENSt8optionalIiEE", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::awaitgenerationresponses (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator24awaitGenerationResponsesERKNSt8optionalINSt6chrono12millisecondsEEENSt8optionalIiEE", false]], 
"tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::canenqueue (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator10canEnqueueEv", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::disaggexecutororchestrator (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator26DisaggExecutorOrchestratorERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorIN8executor14ExecutorConfigEEERKNSt6vectorIN8executor14ExecutorConfigEEEbb", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::enqueuecontext (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator14enqueueContextERKNSt6vectorIN5texec7RequestEEENSt8optionalIiEEb", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::enqueuegeneration (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator17enqueueGenerationERKNSt6vectorIN5texec7RequestEEERKNSt6vectorI6IdTypeEENSt8optionalIiEEb", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::getcontextexecutors (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator19getContextExecutorsEv", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::getgenexecutors (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator15getGenExecutorsEv", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::mimpl (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator5mImplE", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::~disaggexecutororchestrator (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestratorD0Ev", false]], 
"tensorrt_llm::executor::disagg_executor::responsewithid (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdE", false]], "tensorrt_llm::executor::disagg_executor::responsewithid::gid (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId3gidE", false]], "tensorrt_llm::executor::disagg_executor::responsewithid::operator= (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdaSERK14ResponseWithId", false], [0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdaSERR14ResponseWithId", false]], "tensorrt_llm::executor::disagg_executor::responsewithid::response (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId8responseE", false]], "tensorrt_llm::executor::disagg_executor::responsewithid::responsewithid (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERK14ResponseWithId", false], [0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERKN12tensorrt_llm8executor8ResponseE6IdType", false], [0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERR14ResponseWithId", false], [0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERRN12tensorrt_llm8executor8ResponseE6IdType", false]], "tensorrt_llm::executor::disagg_executor::responsewithid::~responsewithid (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdD0Ev", false]], "tensorrt_llm::executor::disservingrequeststats (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor22DisServingRequestStatsE", false]], "tensorrt_llm::executor::disservingrequeststats::kvcachesize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22DisServingRequestStats11kvCacheSizeE", false]], "tensorrt_llm::executor::disservingrequeststats::kvcachetransferms (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor22DisServingRequestStats17kvCacheTransferMSE", false]], "tensorrt_llm::executor::dynamicbatchconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfigE", false]], "tensorrt_llm::executor::dynamicbatchconfig::dynamicbatchconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig18DynamicBatchConfigEbb10SizeType32NSt6vectorINSt4pairI10SizeType3210SizeType32EEEE", false]], "tensorrt_llm::executor::dynamicbatchconfig::getbatchsizetable (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18DynamicBatchConfig17getBatchSizeTableEv", false]], "tensorrt_llm::executor::dynamicbatchconfig::getdynamicbatchmovingaveragewindow (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18DynamicBatchConfig34getDynamicBatchMovingAverageWindowEv", false]], "tensorrt_llm::executor::dynamicbatchconfig::getenablebatchsizetuning (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18DynamicBatchConfig24getEnableBatchSizeTuningEv", false]], "tensorrt_llm::executor::dynamicbatchconfig::getenablemaxnumtokenstuning (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18DynamicBatchConfig27getEnableMaxNumTokensTuningEv", false]], "tensorrt_llm::executor::dynamicbatchconfig::kdefaultbatchsizetable (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig22kDefaultBatchSizeTableE", false]], "tensorrt_llm::executor::dynamicbatchconfig::kdefaultdynamicbatchmovingaveragewindow (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig39kDefaultDynamicBatchMovingAverageWindowE", false]], "tensorrt_llm::executor::dynamicbatchconfig::mbatchsizetable (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig15mBatchSizeTableE", false]], "tensorrt_llm::executor::dynamicbatchconfig::mdynamicbatchmovingaveragewindow (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig32mDynamicBatchMovingAverageWindowE", false]], 
"tensorrt_llm::executor::dynamicbatchconfig::menablebatchsizetuning (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig22mEnableBatchSizeTuningE", false]], "tensorrt_llm::executor::dynamicbatchconfig::menablemaxnumtokenstuning (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig25mEnableMaxNumTokensTuningE", false]], "tensorrt_llm::executor::eaglechoices (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor12EagleChoicesE", false]], "tensorrt_llm::executor::eagleconfig (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor11EagleConfigE", false]], "tensorrt_llm::executor::eagleconfig::checkposteriorvalue (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor11EagleConfig19checkPosteriorValueERKNSt8optionalIfEE", false]], "tensorrt_llm::executor::eagleconfig::eagleconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor11EagleConfig11EagleConfigENSt8optionalI12EagleChoicesEEbNSt8optionalIfEEbNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::eagleconfig::getdynamictreemaxtopk (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11EagleConfig21getDynamicTreeMaxTopKEv", false]], "tensorrt_llm::executor::eagleconfig::geteaglechoices (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11EagleConfig15getEagleChoicesEv", false]], "tensorrt_llm::executor::eagleconfig::getposteriorthreshold (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11EagleConfig21getPosteriorThresholdEv", false]], "tensorrt_llm::executor::eagleconfig::isgreedysampling (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11EagleConfig16isGreedySamplingEv", false]], "tensorrt_llm::executor::eagleconfig::mdynamictreemaxtopk (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11EagleConfig19mDynamicTreeMaxTopKE", false]], "tensorrt_llm::executor::eagleconfig::meaglechoices (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11EagleConfig13mEagleChoicesE", false]], "tensorrt_llm::executor::eagleconfig::mgreedysampling (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor11EagleConfig15mGreedySamplingE", false]], "tensorrt_llm::executor::eagleconfig::mposteriorthreshold (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11EagleConfig19mPosteriorThresholdE", false]], "tensorrt_llm::executor::eagleconfig::musedynamictree (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11EagleConfig15mUseDynamicTreeE", false]], "tensorrt_llm::executor::eagleconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11EagleConfigeqERK11EagleConfig", false]], "tensorrt_llm::executor::eagleconfig::usedynamictree (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11EagleConfig14useDynamicTreeEv", false]], "tensorrt_llm::executor::executor (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8ExecutorE", false]], "tensorrt_llm::executor::executor::awaitresponses (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERK6IdTypeRKNSt8optionalINSt6chrono12millisecondsEEE", false], [0, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERKNSt6vectorI6IdTypeEERKNSt8optionalINSt6chrono12millisecondsEEE", false], [0, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERKNSt8optionalINSt6chrono12millisecondsEEE", false]], "tensorrt_llm::executor::executor::cancelrequest (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Executor13cancelRequestE6IdType", false]], "tensorrt_llm::executor::executor::canenqueuerequests (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8Executor18canEnqueueRequestsEv", false]], "tensorrt_llm::executor::executor::enqueuerequest (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Executor14enqueueRequestERK7Request", false]], "tensorrt_llm::executor::executor::enqueuerequests (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Executor15enqueueRequestsERKNSt6vectorI7RequestEE", false]], "tensorrt_llm::executor::executor::executor (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor8Executor8ExecutorENSt10shared_ptrI5ModelEENSt10shared_ptrI5ModelEERK14ExecutorConfig", false], [0, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorENSt10shared_ptrI5ModelEERK14ExecutorConfig", false], [0, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfigRKNSt8optionalINSt3mapINSt6stringE6TensorEEEE", false], [0, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfig", false], [0, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK8Executor", false], [0, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", false], [0, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", false], [0, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERR8Executor", false]], "tensorrt_llm::executor::executor::getkvcacheeventmanager (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8Executor22getKVCacheEventManagerEv", false]], "tensorrt_llm::executor::executor::getlatestdebugtensors (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Executor21getLatestDebugTensorsEv", false]], "tensorrt_llm::executor::executor::getlatestiterationstats (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Executor23getLatestIterationStatsEv", false]], "tensorrt_llm::executor::executor::getlatestrequeststats (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Executor21getLatestRequestStatsEv", false]], "tensorrt_llm::executor::executor::getnumresponsesready (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8Executor20getNumResponsesReadyERKNSt8optionalI6IdTypeEE", false]], "tensorrt_llm::executor::executor::isparticipant (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8Executor13isParticipantEv", false]], "tensorrt_llm::executor::executor::mimpl (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor8Executor5mImplE", false]], "tensorrt_llm::executor::executor::operator= (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8ExecutoraSERK8Executor", false], [0, "_CPPv4N12tensorrt_llm8executor8ExecutoraSERR8Executor", false]], "tensorrt_llm::executor::executor::shutdown (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Executor8shutdownEv", false]], "tensorrt_llm::executor::executor::~executor (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8ExecutorD0Ev", false]], "tensorrt_llm::executor::executorconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfigE", false]], "tensorrt_llm::executor::executorconfig::executorconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", false]], "tensorrt_llm::executor::executorconfig::getadditionalmodeloutputs (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getAdditionalModelOutputsEv", false]], "tensorrt_llm::executor::executorconfig::getbatchingtype (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getBatchingTypeEv", false]], "tensorrt_llm::executor::executorconfig::getcachetransceiverconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getCacheTransceiverConfigEv", false]], "tensorrt_llm::executor::executorconfig::getdebugconfig (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor14ExecutorConfig14getDebugConfigEv", false]], "tensorrt_llm::executor::executorconfig::getdecodingconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig17getDecodingConfigEv", false]], "tensorrt_llm::executor::executorconfig::getenablechunkedcontext (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig23getEnableChunkedContextEv", false]], "tensorrt_llm::executor::executorconfig::getenabletrtoverlap (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig19getEnableTrtOverlapEv", false]], "tensorrt_llm::executor::executorconfig::getextendedruntimeperfknobconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig32getExtendedRuntimePerfKnobConfigEv", false]], "tensorrt_llm::executor::executorconfig::getgathergenerationlogits (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getGatherGenerationLogitsEv", false]], "tensorrt_llm::executor::executorconfig::getgpuweightspercent (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig20getGpuWeightsPercentEv", false]], "tensorrt_llm::executor::executorconfig::getguideddecodingconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig23getGuidedDecodingConfigEv", false]], "tensorrt_llm::executor::executorconfig::getiterstatsmaxiterations (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getIterStatsMaxIterationsEv", false]], "tensorrt_llm::executor::executorconfig::getkvcacheconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig16getKvCacheConfigEv", false]], "tensorrt_llm::executor::executorconfig::getkvcacheconfigref (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig19getKvCacheConfigRefEv", false]], "tensorrt_llm::executor::executorconfig::getlogitspostprocessorconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig28getLogitsPostProcessorConfigEv", false]], 
"tensorrt_llm::executor::executorconfig::getmaxbatchsize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getMaxBatchSizeEv", false]], "tensorrt_llm::executor::executorconfig::getmaxbeamwidth (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getMaxBeamWidthEv", false]], "tensorrt_llm::executor::executorconfig::getmaxnumtokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getMaxNumTokensEv", false]], "tensorrt_llm::executor::executorconfig::getmaxqueuesize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getMaxQueueSizeEv", false]], "tensorrt_llm::executor::executorconfig::getmaxseqidlemicroseconds (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getMaxSeqIdleMicrosecondsEv", false]], "tensorrt_llm::executor::executorconfig::getnormalizelogprobs (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig20getNormalizeLogProbsEv", false]], "tensorrt_llm::executor::executorconfig::getparallelconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig17getParallelConfigEv", false]], "tensorrt_llm::executor::executorconfig::getpeftcacheconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig18getPeftCacheConfigEv", false]], "tensorrt_llm::executor::executorconfig::getprompttableoffloading (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig24getPromptTableOffloadingEv", false]], "tensorrt_llm::executor::executorconfig::getrecvpollperiodms (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig19getRecvPollPeriodMsEv", false]], "tensorrt_llm::executor::executorconfig::getrequeststatsmaxiterations (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig28getRequestStatsMaxIterationsEv", false]], "tensorrt_llm::executor::executorconfig::getschedulerconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig18getSchedulerConfigEv", false]], 
"tensorrt_llm::executor::executorconfig::getschedulerconfigref (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig21getSchedulerConfigRefEv", false]], "tensorrt_llm::executor::executorconfig::getspecdecconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig16getSpecDecConfigEv", false]], "tensorrt_llm::executor::executorconfig::getusegpudirectstorage (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig22getUseGpuDirectStorageEv", false]], "tensorrt_llm::executor::executorconfig::kdefaultiterstatsmaxiterations (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig30kDefaultIterStatsMaxIterationsE", false]], "tensorrt_llm::executor::executorconfig::kdefaultmaxseqidlemicroseconds (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig30kDefaultMaxSeqIdleMicrosecondsE", false]], "tensorrt_llm::executor::executorconfig::kdefaultrequeststatsmaxiterations (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig33kDefaultRequestStatsMaxIterationsE", false]], "tensorrt_llm::executor::executorconfig::madditionalmodeloutputs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mAdditionalModelOutputsE", false]], "tensorrt_llm::executor::executorconfig::mbatchingtype (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mBatchingTypeE", false]], "tensorrt_llm::executor::executorconfig::mcachetransceiverconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mCacheTransceiverConfigE", false]], "tensorrt_llm::executor::executorconfig::mdebugconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig12mDebugConfigE", false]], "tensorrt_llm::executor::executorconfig::mdecodingconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15mDecodingConfigE", false]], "tensorrt_llm::executor::executorconfig::menablechunkedcontext (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig21mEnableChunkedContextE", false]], "tensorrt_llm::executor::executorconfig::menabletrtoverlap (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig17mEnableTrtOverlapE", false]], "tensorrt_llm::executor::executorconfig::mextendedruntimeperfknobconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig30mExtendedRuntimePerfKnobConfigE", false]], "tensorrt_llm::executor::executorconfig::mgathergenerationlogits (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mGatherGenerationLogitsE", false]], "tensorrt_llm::executor::executorconfig::mgpuweightspercent (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18mGpuWeightsPercentE", false]], "tensorrt_llm::executor::executorconfig::mguideddecodingconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig21mGuidedDecodingConfigE", false]], "tensorrt_llm::executor::executorconfig::miterstatsmaxiterations (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mIterStatsMaxIterationsE", false]], "tensorrt_llm::executor::executorconfig::mkvcacheconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14mKvCacheConfigE", false]], "tensorrt_llm::executor::executorconfig::mlogitspostprocessorconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig26mLogitsPostProcessorConfigE", false]], "tensorrt_llm::executor::executorconfig::mmaxbatchsize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mMaxBatchSizeE", false]], "tensorrt_llm::executor::executorconfig::mmaxbeamwidth (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mMaxBeamWidthE", false]], "tensorrt_llm::executor::executorconfig::mmaxnumtokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mMaxNumTokensE", false]], "tensorrt_llm::executor::executorconfig::mmaxqueuesize (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mMaxQueueSizeE", false]], "tensorrt_llm::executor::executorconfig::mmaxseqidlemicroseconds (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mMaxSeqIdleMicrosecondsE", false]], "tensorrt_llm::executor::executorconfig::mnormalizelogprobs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18mNormalizeLogProbsE", false]], "tensorrt_llm::executor::executorconfig::mparallelconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15mParallelConfigE", false]], "tensorrt_llm::executor::executorconfig::mpeftcacheconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig16mPeftCacheConfigE", false]], "tensorrt_llm::executor::executorconfig::mprompttableoffloading (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig22mPromptTableOffloadingE", false]], "tensorrt_llm::executor::executorconfig::mrecvpollperiodms (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig17mRecvPollPeriodMsE", false]], "tensorrt_llm::executor::executorconfig::mrequeststatsmaxiterations (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig26mRequestStatsMaxIterationsE", false]], "tensorrt_llm::executor::executorconfig::mschedulerconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig16mSchedulerConfigE", false]], "tensorrt_llm::executor::executorconfig::mspeculativedecodingconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig26mSpeculativeDecodingConfigE", false]], "tensorrt_llm::executor::executorconfig::musegpudirectstorage (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig20mUseGpuDirectStorageE", false]], "tensorrt_llm::executor::executorconfig::setadditionalmodeloutputs (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setAdditionalModelOutputsERKNSt6vectorI21AdditionalModelOutputEE", false]], "tensorrt_llm::executor::executorconfig::setbatchingtype (c++ 
function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setBatchingTypeE12BatchingType", false]], "tensorrt_llm::executor::executorconfig::setcachetransceiverconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setCacheTransceiverConfigERK22CacheTransceiverConfig", false]], "tensorrt_llm::executor::executorconfig::setdebugconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14setDebugConfigERK11DebugConfig", false]], "tensorrt_llm::executor::executorconfig::setdecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig17setDecodingConfigERK14DecodingConfig", false]], "tensorrt_llm::executor::executorconfig::setenablechunkedcontext (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23setEnableChunkedContextEb", false]], "tensorrt_llm::executor::executorconfig::setenabletrtoverlap (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig19setEnableTrtOverlapEb", false]], "tensorrt_llm::executor::executorconfig::setextendedruntimeperfknobconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig32setExtendedRuntimePerfKnobConfigERK29ExtendedRuntimePerfKnobConfig", false]], "tensorrt_llm::executor::executorconfig::setgathergenerationlogits (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setGatherGenerationLogitsEb", false]], "tensorrt_llm::executor::executorconfig::setgpuweightspercent (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig20setGpuWeightsPercentERKf", false]], "tensorrt_llm::executor::executorconfig::setguideddecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23setGuidedDecodingConfigERK20GuidedDecodingConfig", false]], "tensorrt_llm::executor::executorconfig::setiterstatsmaxiterations (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setIterStatsMaxIterationsE10SizeType32", false]], 
"tensorrt_llm::executor::executorconfig::setkvcacheconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig16setKvCacheConfigERK13KvCacheConfig", false]], "tensorrt_llm::executor::executorconfig::setlogitspostprocessorconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig28setLogitsPostProcessorConfigERK25LogitsPostProcessorConfig", false]], "tensorrt_llm::executor::executorconfig::setmaxbatchsize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxBatchSizeE10SizeType32", false]], "tensorrt_llm::executor::executorconfig::setmaxbeamwidth (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxBeamWidthE10SizeType32", false]], "tensorrt_llm::executor::executorconfig::setmaxnumtokens (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxNumTokensE10SizeType32", false]], "tensorrt_llm::executor::executorconfig::setmaxqueuesize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxQueueSizeERKNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::executorconfig::setmaxseqidlemicroseconds (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setMaxSeqIdleMicrosecondsE8uint64_t", false]], "tensorrt_llm::executor::executorconfig::setnormalizelogprobs (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig20setNormalizeLogProbsEb", false]], "tensorrt_llm::executor::executorconfig::setparallelconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig17setParallelConfigERK14ParallelConfig", false]], "tensorrt_llm::executor::executorconfig::setpeftcacheconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18setPeftCacheConfigERK15PeftCacheConfig", false]], "tensorrt_llm::executor::executorconfig::setprompttableoffloading (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig24setPromptTableOffloadingEb", false]], 
"tensorrt_llm::executor::executorconfig::setrecvpollperiodms (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig19setRecvPollPeriodMsERK10SizeType32", false]], "tensorrt_llm::executor::executorconfig::setrequeststatsmaxiterations (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig28setRequestStatsMaxIterationsE10SizeType32", false]], "tensorrt_llm::executor::executorconfig::setschedulerconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18setSchedulerConfigERK15SchedulerConfig", false]], "tensorrt_llm::executor::executorconfig::setspecdecconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig16setSpecDecConfigERK25SpeculativeDecodingConfig", false]], "tensorrt_llm::executor::executorconfig::setusegpudirectstorage (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig22setUseGpuDirectStorageERKb", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfigE", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::extendedruntimeperfknobconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig29ExtendedRuntimePerfKnobConfigEbbb10SizeType32", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::getcudagraphcachesize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig21getCudaGraphCacheSizeEv", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::getcudagraphmode (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig16getCudaGraphModeEv", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::getenablecontextfmhafp32acc (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig27getEnableContextFMHAFP32AccEv", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::getmultiblockmode (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig17getMultiBlockModeEv", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::mcudagraphcachesize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig19mCudaGraphCacheSizeE", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::mcudagraphmode (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig14mCudaGraphModeE", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::menablecontextfmhafp32acc (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig25mEnableContextFMHAFP32AccE", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::mmultiblockmode (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig15mMultiBlockModeE", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfigeqERK29ExtendedRuntimePerfKnobConfig", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::setcudagraphcachesize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig21setCudaGraphCacheSizeE10SizeType32", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::setcudagraphmode (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig16setCudaGraphModeEb", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::setenablecontextfmhafp32acc (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig27setEnableContextFMHAFP32AccEb", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::setmultiblockmode (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig17setMultiBlockModeEb", false]], "tensorrt_llm::executor::externaldrafttokensconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfigE", false]], 
"tensorrt_llm::executor::externaldrafttokensconfig::externaldrafttokensconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig25ExternalDraftTokensConfigE9VecTokensNSt8optionalI6TensorEERKNSt8optionalI9FloatTypeEERKNSt8optionalIbEE", false]], "tensorrt_llm::executor::externaldrafttokensconfig::getacceptancethreshold (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor25ExternalDraftTokensConfig22getAcceptanceThresholdEv", false]], "tensorrt_llm::executor::externaldrafttokensconfig::getfastlogits (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor25ExternalDraftTokensConfig13getFastLogitsEv", false]], "tensorrt_llm::executor::externaldrafttokensconfig::getlogits (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor25ExternalDraftTokensConfig9getLogitsEv", false]], "tensorrt_llm::executor::externaldrafttokensconfig::gettokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor25ExternalDraftTokensConfig9getTokensEv", false]], "tensorrt_llm::executor::externaldrafttokensconfig::macceptancethreshold (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig20mAcceptanceThresholdE", false]], "tensorrt_llm::executor::externaldrafttokensconfig::mfastlogits (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig11mFastLogitsE", false]], "tensorrt_llm::executor::externaldrafttokensconfig::mlogits (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig7mLogitsE", false]], "tensorrt_llm::executor::externaldrafttokensconfig::mtokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig7mTokensE", false]], "tensorrt_llm::executor::finishreason (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor12FinishReasonE", false]], "tensorrt_llm::executor::finishreason::kcancelled (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12FinishReason10kCANCELLEDE", false]], "tensorrt_llm::executor::finishreason::kend_id (c++ enumerator)": [[0, 
"_CPPv4N12tensorrt_llm8executor12FinishReason7kEND_IDE", false]], "tensorrt_llm::executor::finishreason::klength (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12FinishReason7kLENGTHE", false]], "tensorrt_llm::executor::finishreason::knot_finished (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12FinishReason13kNOT_FINISHEDE", false]], "tensorrt_llm::executor::finishreason::kstop_words (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12FinishReason11kSTOP_WORDSE", false]], "tensorrt_llm::executor::finishreason::ktimed_out (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12FinishReason10kTIMED_OUTE", false]], "tensorrt_llm::executor::floattype (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor9FloatTypeE", false]], "tensorrt_llm::executor::guideddecodingconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfigE", false]], "tensorrt_llm::executor::guideddecodingconfig::getbackend (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig10getBackendEv", false]], "tensorrt_llm::executor::guideddecodingconfig::getencodedvocab (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig15getEncodedVocabEv", false]], "tensorrt_llm::executor::guideddecodingconfig::getstoptokenids (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig15getStopTokenIdsEv", false]], "tensorrt_llm::executor::guideddecodingconfig::gettokenizerstr (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig15getTokenizerStrEv", false]], "tensorrt_llm::executor::guideddecodingconfig::guideddecodingbackend (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig21GuidedDecodingBackendE", false]], "tensorrt_llm::executor::guideddecodingconfig::guideddecodingbackend::kxgrammar (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig21GuidedDecodingBackend9kXGRAMMARE", false]], "tensorrt_llm::executor::guideddecodingconfig::guideddecodingconfig 
(c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig20GuidedDecodingConfigE21GuidedDecodingBackendNSt8optionalINSt6vectorINSt6stringEEEEENSt8optionalINSt6stringEEENSt8optionalINSt6vectorI11TokenIdTypeEEEE", false]], "tensorrt_llm::executor::guideddecodingconfig::mbackend (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig8mBackendE", false]], "tensorrt_llm::executor::guideddecodingconfig::mencodedvocab (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig13mEncodedVocabE", false]], "tensorrt_llm::executor::guideddecodingconfig::mstoptokenids (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig13mStopTokenIdsE", false]], "tensorrt_llm::executor::guideddecodingconfig::mtokenizerstr (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig13mTokenizerStrE", false]], "tensorrt_llm::executor::guideddecodingconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfigeqERK20GuidedDecodingConfig", false]], "tensorrt_llm::executor::guideddecodingconfig::setbackend (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig10setBackendERK21GuidedDecodingBackend", false]], "tensorrt_llm::executor::guideddecodingconfig::setencodedvocab (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig15setEncodedVocabERKNSt6vectorINSt6stringEEE", false]], "tensorrt_llm::executor::guideddecodingconfig::setstoptokenids (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig15setStopTokenIdsERKNSt6vectorI11TokenIdTypeEE", false]], "tensorrt_llm::executor::guideddecodingconfig::settokenizerstr (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig15setTokenizerStrERKNSt6stringE", false]], "tensorrt_llm::executor::guideddecodingconfig::validate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig8validateEv", false]], 
"tensorrt_llm::executor::guideddecodingparams (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParamsE", false]], "tensorrt_llm::executor::guideddecodingparams::getguide (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingParams8getGuideEv", false]], "tensorrt_llm::executor::guideddecodingparams::getguidetype (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingParams12getGuideTypeEv", false]], "tensorrt_llm::executor::guideddecodingparams::guideddecodingparams (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams20GuidedDecodingParamsE9GuideTypeNSt8optionalINSt6stringEEE", false]], "tensorrt_llm::executor::guideddecodingparams::guidetype (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideTypeE", false]], "tensorrt_llm::executor::guideddecodingparams::guidetype::kebnf_grammar (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType13kEBNF_GRAMMARE", false]], "tensorrt_llm::executor::guideddecodingparams::guidetype::kjson (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType5kJSONE", false]], "tensorrt_llm::executor::guideddecodingparams::guidetype::kjson_schema (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType12kJSON_SCHEMAE", false]], "tensorrt_llm::executor::guideddecodingparams::guidetype::kregex (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType6kREGEXE", false]], "tensorrt_llm::executor::guideddecodingparams::guidetype::kstructural_tag (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType15kSTRUCTURAL_TAGE", false]], "tensorrt_llm::executor::guideddecodingparams::mguide (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams6mGuideE", false]], "tensorrt_llm::executor::guideddecodingparams::mguidetype (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams10mGuideTypeE", false]], "tensorrt_llm::executor::guideddecodingparams::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingParamseqERK20GuidedDecodingParams", false]], "tensorrt_llm::executor::idtype (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor6IdTypeE", false]], "tensorrt_llm::executor::inflightbatchingstats (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStatsE", false]], "tensorrt_llm::executor::inflightbatchingstats::avgnumdecodedtokensperiter (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats26avgNumDecodedTokensPerIterE", false]], "tensorrt_llm::executor::inflightbatchingstats::microbatchid (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats12microBatchIdE", false]], "tensorrt_llm::executor::inflightbatchingstats::numcontextrequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats18numContextRequestsE", false]], "tensorrt_llm::executor::inflightbatchingstats::numctxtokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats12numCtxTokensE", false]], "tensorrt_llm::executor::inflightbatchingstats::numgenrequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats14numGenRequestsE", false]], "tensorrt_llm::executor::inflightbatchingstats::numpausedrequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats17numPausedRequestsE", false]], "tensorrt_llm::executor::inflightbatchingstats::numscheduledrequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats20numScheduledRequestsE", false]], "tensorrt_llm::executor::iterationstats (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStatsE", false]], "tensorrt_llm::executor::iterationstats::cpumemusage (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats11cpuMemUsageE", false]], 
"tensorrt_llm::executor::iterationstats::crosskvcachestats (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats17crossKvCacheStatsE", false]], "tensorrt_llm::executor::iterationstats::gpumemusage (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats11gpuMemUsageE", false]], "tensorrt_llm::executor::iterationstats::inflightbatchingstats (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats21inflightBatchingStatsE", false]], "tensorrt_llm::executor::iterationstats::iter (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats4iterE", false]], "tensorrt_llm::executor::iterationstats::iterlatencyms (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats13iterLatencyMSE", false]], "tensorrt_llm::executor::iterationstats::kvcachestats (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats12kvCacheStatsE", false]], "tensorrt_llm::executor::iterationstats::maxbatchsizeruntime (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats19maxBatchSizeRuntimeE", false]], "tensorrt_llm::executor::iterationstats::maxbatchsizestatic (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats18maxBatchSizeStaticE", false]], "tensorrt_llm::executor::iterationstats::maxbatchsizetunerrecommended (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats28maxBatchSizeTunerRecommendedE", false]], "tensorrt_llm::executor::iterationstats::maxnumactiverequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats20maxNumActiveRequestsE", false]], "tensorrt_llm::executor::iterationstats::maxnumtokensruntime (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats19maxNumTokensRuntimeE", false]], "tensorrt_llm::executor::iterationstats::maxnumtokensstatic (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats18maxNumTokensStaticE", false]], "tensorrt_llm::executor::iterationstats::maxnumtokenstunerrecommended (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor14IterationStats28maxNumTokensTunerRecommendedE", false]], "tensorrt_llm::executor::iterationstats::newactiverequestsqueuelatencyms (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats31newActiveRequestsQueueLatencyMSE", false]], "tensorrt_llm::executor::iterationstats::numactiverequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats17numActiveRequestsE", false]], "tensorrt_llm::executor::iterationstats::numcompletedrequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats20numCompletedRequestsE", false]], "tensorrt_llm::executor::iterationstats::numnewactiverequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats20numNewActiveRequestsE", false]], "tensorrt_llm::executor::iterationstats::numqueuedrequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats17numQueuedRequestsE", false]], "tensorrt_llm::executor::iterationstats::pinnedmemusage (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats14pinnedMemUsageE", false]], "tensorrt_llm::executor::iterationstats::staticbatchingstats (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats19staticBatchingStatsE", false]], "tensorrt_llm::executor::iterationstats::timestamp (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats9timestampE", false]], "tensorrt_llm::executor::iterationtype (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor13IterationTypeE", false]], "tensorrt_llm::executor::jsonserialization (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor17JsonSerializationE", false]], "tensorrt_llm::executor::jsonserialization::tojsonstr (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK12RequestStats", false], [0, "_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK14IterationStats", false], [0, "_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK24RequestStatsPerIteration", false]], 
"tensorrt_llm::executor::kv_cache (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cacheE", false]], "tensorrt_llm::executor::kv_cache::cachestate (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheStateE", false]], "tensorrt_llm::executor::kv_cache::cachestate::attentionconfig (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfigE", false]], "tensorrt_llm::executor::kv_cache::cachestate::attentionconfig::attentionconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfig15AttentionConfigE13AttentionTypei", false]], "tensorrt_llm::executor::kv_cache::cachestate::attentionconfig::mattentiontype (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfig14mAttentionTypeE", false]], "tensorrt_llm::executor::kv_cache::cachestate::attentionconfig::mkvfactor (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfig9mKvFactorE", false]], "tensorrt_llm::executor::kv_cache::cachestate::attentiontype (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState13AttentionTypeE", false]], "tensorrt_llm::executor::kv_cache::cachestate::attentiontype::kdefault (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState13AttentionType8kDEFAULTE", false]], "tensorrt_llm::executor::kv_cache::cachestate::attentiontype::kmla (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState13AttentionType4kMLAE", false]], "tensorrt_llm::executor::kv_cache::cachestate::cachestate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", false], [0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE11ModelConfigRKN7runtime11WorldConfigEN8nvinfer18DataTypeE13AttentionTypei", false], [0, 
"_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", false]], "tensorrt_llm::executor::kv_cache::cachestate::getattentionconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState18getAttentionConfigEv", false]], "tensorrt_llm::executor::kv_cache::cachestate::getdatatype (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState11getDataTypeEv", false]], "tensorrt_llm::executor::kv_cache::cachestate::getmodelconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState14getModelConfigEv", false]], "tensorrt_llm::executor::kv_cache::cachestate::getparallelconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState17getParallelConfigEv", false]], "tensorrt_llm::executor::kv_cache::cachestate::mattentionconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState16mAttentionConfigE", false]], "tensorrt_llm::executor::kv_cache::cachestate::mdatatype (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState9mDataTypeE", false]], "tensorrt_llm::executor::kv_cache::cachestate::mmodelconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState12mModelConfigE", false]], "tensorrt_llm::executor::kv_cache::cachestate::modelconfig (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState11ModelConfigE", false]], "tensorrt_llm::executor::kv_cache::cachestate::modelconfig::mnbkvheadsperlayer (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState11ModelConfig18mNbKvHeadsPerLayerE", false]], "tensorrt_llm::executor::kv_cache::cachestate::modelconfig::msizeperhead (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState11ModelConfig12mSizePerHeadE", false]], "tensorrt_llm::executor::kv_cache::cachestate::modelconfig::mtokensperblock (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState11ModelConfig15mTokensPerBlockE", false]], "tensorrt_llm::executor::kv_cache::cachestate::modelconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState11ModelConfigeqERK11ModelConfig", false]], "tensorrt_llm::executor::kv_cache::cachestate::mparallelconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15mParallelConfigE", false]], "tensorrt_llm::executor::kv_cache::cachestate::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheStateeqERKN8kv_cache10CacheStateE", false]], "tensorrt_llm::executor::kv_cache::cachestate::parallelconfig (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfigE", false]], "tensorrt_llm::executor::kv_cache::cachestate::parallelconfig::mdprank (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig7mDPrankE", false]], "tensorrt_llm::executor::kv_cache::cachestate::parallelconfig::mdpsize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig7mDPsizeE", false]], "tensorrt_llm::executor::kv_cache::cachestate::parallelconfig::menableattentiondp (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig18mEnableAttentionDPE", false]], "tensorrt_llm::executor::kv_cache::cachestate::parallelconfig::mpipelineparallelism (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig20mPipelineParallelismE", false]], "tensorrt_llm::executor::kv_cache::cachestate::parallelconfig::mtensorparallelism (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig18mTensorParallelismE", false]], "tensorrt_llm::executor::kv_cache::cachestate::parallelconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfigeqERK14ParallelConfig", false]], 
"tensorrt_llm::executor::kv_cache::cachestate::tostring (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState8toStringEv", false]], "tensorrt_llm::executor::kv_cache::commstate (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommStateE", false]], "tensorrt_llm::executor::kv_cache::commstate::commstate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10SizeType32EEi", false], [0, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI11SocketStateEEi", false], [0, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt8uint16_tENSt6stringE", false], [0, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateEv", false]], "tensorrt_llm::executor::kv_cache::commstate::getmpistate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState11getMpiStateEv", false]], "tensorrt_llm::executor::kv_cache::commstate::getselfidx (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState10getSelfIdxEv", false]], "tensorrt_llm::executor::kv_cache::commstate::getsocketstate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState14getSocketStateEv", false]], "tensorrt_llm::executor::kv_cache::commstate::ismpistate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState10isMpiStateEv", false]], "tensorrt_llm::executor::kv_cache::commstate::issocketstate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState13isSocketStateEv", false]], "tensorrt_llm::executor::kv_cache::commstate::mselfidx (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState8mSelfIdxE", false]], "tensorrt_llm::executor::kv_cache::commstate::mstate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState6mStateE", false]], "tensorrt_llm::executor::kv_cache::commstate::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommStateeqERK9CommState", false]], 
"tensorrt_llm::executor::kv_cache::commstate::tostring (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState8toStringEv", false]], "tensorrt_llm::executor::kv_cache::connection (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10ConnectionE", false]], "tensorrt_llm::executor::kv_cache::connection::isthreadsafe (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection12isThreadSafeEv", false]], "tensorrt_llm::executor::kv_cache::connection::recv (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4recvERK11DataContextPv6size_t", false]], "tensorrt_llm::executor::kv_cache::connection::send (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4sendERK11DataContextPKv6size_t", false]], "tensorrt_llm::executor::kv_cache::connection::~connection (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10ConnectionD0Ev", false]], "tensorrt_llm::executor::kv_cache::connectionmanager (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManagerE", false]], "tensorrt_llm::executor::kv_cache::connectionmanager::getcommstate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache17ConnectionManager12getCommStateEv", false]], "tensorrt_llm::executor::kv_cache::connectionmanager::getconnections (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManager14getConnectionsERK9CommState", false]], "tensorrt_llm::executor::kv_cache::connectionmanager::recvconnect (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManager11recvConnectERK11DataContextPv6size_t", false]], "tensorrt_llm::executor::kv_cache::connectionmanager::~connectionmanager (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManagerD0Ev", false]], "tensorrt_llm::executor::kv_cache::datacontext (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache11DataContextE", false]], 
"tensorrt_llm::executor::kv_cache::datacontext::datacontext (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache11DataContext11DataContextEi", false]], "tensorrt_llm::executor::kv_cache::datacontext::gettag (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache11DataContext6getTagEv", false]], "tensorrt_llm::executor::kv_cache::datacontext::mtag (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache11DataContext4mTagE", false]], "tensorrt_llm::executor::kv_cache::mpistate (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache8MpiStateE", false]], "tensorrt_llm::executor::kv_cache::mpistate::mranks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache8MpiState6mRanksE", false]], "tensorrt_llm::executor::kv_cache::mpistate::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache8MpiStateeqERK8MpiState", false]], "tensorrt_llm::executor::kv_cache::mpistate::tostring (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache8MpiState8toStringEv", false]], "tensorrt_llm::executor::kv_cache::socketstate (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache11SocketStateE", false]], "tensorrt_llm::executor::kv_cache::socketstate::mip (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache11SocketState3mIpE", false]], "tensorrt_llm::executor::kv_cache::socketstate::mport (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache11SocketState5mPortE", false]], "tensorrt_llm::executor::kv_cache::socketstate::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache11SocketStateeqERK11SocketState", false]], "tensorrt_llm::executor::kv_cache::socketstate::tostring (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache11SocketState8toStringEv", false]], "tensorrt_llm::executor::kvcacheconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfigE", false]], "tensorrt_llm::executor::kvcacheconfig::fillemptyfieldsfromruntimedefaults (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor13KvCacheConfig34fillEmptyFieldsFromRuntimeDefaultsEN12tensorrt_llm7runtime15RuntimeDefaultsE", false]], "tensorrt_llm::executor::kvcacheconfig::getcopyonpartialreuse (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig21getCopyOnPartialReuseEv", false]], "tensorrt_llm::executor::kvcacheconfig::getcrosskvcachefraction (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig23getCrossKvCacheFractionEv", false]], "tensorrt_llm::executor::kvcacheconfig::getenableblockreuse (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig19getEnableBlockReuseEv", false]], "tensorrt_llm::executor::kvcacheconfig::getenablepartialreuse (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig21getEnablePartialReuseEv", false]], "tensorrt_llm::executor::kvcacheconfig::geteventbuffermaxsize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig21getEventBufferMaxSizeEv", false]], "tensorrt_llm::executor::kvcacheconfig::getfreegpumemoryfraction (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig24getFreeGpuMemoryFractionEv", false]], "tensorrt_llm::executor::kvcacheconfig::gethostcachesize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig16getHostCacheSizeEv", false]], "tensorrt_llm::executor::kvcacheconfig::getmaxattentionwindowvec (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig24getMaxAttentionWindowVecEv", false]], "tensorrt_llm::executor::kvcacheconfig::getmaxtokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig12getMaxTokensEv", false]], "tensorrt_llm::executor::kvcacheconfig::getonboardblocks (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig16getOnboardBlocksEv", false]], "tensorrt_llm::executor::kvcacheconfig::getsecondaryoffloadminpriority (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig30getSecondaryOffloadMinPriorityEv", false]], 
"tensorrt_llm::executor::kvcacheconfig::getsinktokenlength (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig18getSinkTokenLengthEv", false]], "tensorrt_llm::executor::kvcacheconfig::kvcacheconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", false]], "tensorrt_llm::executor::kvcacheconfig::mcopyonpartialreuse (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig19mCopyOnPartialReuseE", false]], "tensorrt_llm::executor::kvcacheconfig::mcrosskvcachefraction (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21mCrossKvCacheFractionE", false]], "tensorrt_llm::executor::kvcacheconfig::menableblockreuse (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig17mEnableBlockReuseE", false]], "tensorrt_llm::executor::kvcacheconfig::menablepartialreuse (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig19mEnablePartialReuseE", false]], "tensorrt_llm::executor::kvcacheconfig::meventbuffermaxsize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig19mEventBufferMaxSizeE", false]], "tensorrt_llm::executor::kvcacheconfig::mfreegpumemoryfraction (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig22mFreeGpuMemoryFractionE", false]], "tensorrt_llm::executor::kvcacheconfig::mhostcachesize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig14mHostCacheSizeE", false]], "tensorrt_llm::executor::kvcacheconfig::mmaxattentionwindowvec (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig22mMaxAttentionWindowVecE", false]], "tensorrt_llm::executor::kvcacheconfig::mmaxtokens (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor13KvCacheConfig10mMaxTokensE", false]], "tensorrt_llm::executor::kvcacheconfig::monboardblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig14mOnboardBlocksE", false]], "tensorrt_llm::executor::kvcacheconfig::msecondaryoffloadminpriority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig28mSecondaryOffloadMinPriorityE", false]], "tensorrt_llm::executor::kvcacheconfig::msinktokenlength (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig16mSinkTokenLengthE", false]], "tensorrt_llm::executor::kvcacheconfig::setcopyonpartialreuse (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21setCopyOnPartialReuseEb", false]], "tensorrt_llm::executor::kvcacheconfig::setcrosskvcachefraction (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig23setCrossKvCacheFractionE9FloatType", false]], "tensorrt_llm::executor::kvcacheconfig::setenableblockreuse (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig19setEnableBlockReuseEb", false]], "tensorrt_llm::executor::kvcacheconfig::setenablepartialreuse (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21setEnablePartialReuseEb", false]], "tensorrt_llm::executor::kvcacheconfig::seteventbuffermaxsize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21setEventBufferMaxSizeE6size_t", false]], "tensorrt_llm::executor::kvcacheconfig::setfreegpumemoryfraction (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig24setFreeGpuMemoryFractionE9FloatType", false]], "tensorrt_llm::executor::kvcacheconfig::sethostcachesize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig16setHostCacheSizeE6size_t", false]], "tensorrt_llm::executor::kvcacheconfig::setmaxattentionwindowvec (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig24setMaxAttentionWindowVecENSt6vectorI10SizeType32EE", false]], "tensorrt_llm::executor::kvcacheconfig::setmaxtokens 
(c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig12setMaxTokensE10SizeType32", false]], "tensorrt_llm::executor::kvcacheconfig::setonboardblocks (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig16setOnboardBlocksEb", false]], "tensorrt_llm::executor::kvcacheconfig::setsecondaryoffloadminpriority (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig30setSecondaryOffloadMinPriorityENSt8optionalI17RetentionPriorityEE", false]], "tensorrt_llm::executor::kvcacheconfig::setsinktokenlength (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig18setSinkTokenLengthE10SizeType32", false]], "tensorrt_llm::executor::kvcachecreateddata (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheCreatedDataE", false]], "tensorrt_llm::executor::kvcachecreateddata::numblockspercachelevel (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheCreatedData22numBlocksPerCacheLevelE", false]], "tensorrt_llm::executor::kvcacheevent (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor12KVCacheEventE", false]], "tensorrt_llm::executor::kvcacheevent::data (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KVCacheEvent4dataE", false]], "tensorrt_llm::executor::kvcacheevent::eventid (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KVCacheEvent7eventIdE", false]], "tensorrt_llm::executor::kvcacheevent::kvcacheevent (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12KVCacheEvent12KVCacheEventE6IdType16KVCacheEventData", false]], "tensorrt_llm::executor::kvcacheeventdata (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor16KVCacheEventDataE", false]], "tensorrt_llm::executor::kvcacheeventdiff (c++ struct)": [[0, "_CPPv4I0EN12tensorrt_llm8executor16KVCacheEventDiffE", false]], "tensorrt_llm::executor::kvcacheeventdiff::newvalue (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor16KVCacheEventDiff8newValueE", false]], "tensorrt_llm::executor::kvcacheeventdiff::oldvalue (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor16KVCacheEventDiff8oldValueE", false]], "tensorrt_llm::executor::kvcacheeventmanager (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManagerE", false]], "tensorrt_llm::executor::kvcacheeventmanager::getlatestevents (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManager15getLatestEventsENSt8optionalINSt6chrono12millisecondsEEE", false]], "tensorrt_llm::executor::kvcacheeventmanager::kvcacheeventmanager (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManager19KVCacheEventManagerENSt10shared_ptrIN12tensorrt_llm13batch_manager16kv_cache_manager18BaseKVCacheManagerEEE", false]], "tensorrt_llm::executor::kvcacheeventmanager::kvcachemanager (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManager14kvCacheManagerE", false]], "tensorrt_llm::executor::kvcacheremoveddata (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheRemovedDataE", false]], "tensorrt_llm::executor::kvcacheremoveddata::blockhashes (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheRemovedData11blockHashesE", false]], "tensorrt_llm::executor::kvcacheretentionconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfigE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::getdecodedurationms (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig19getDecodeDurationMsEv", false]], "tensorrt_llm::executor::kvcacheretentionconfig::getdecoderetentionpriority (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig26getDecodeRetentionPriorityEv", false]], "tensorrt_llm::executor::kvcacheretentionconfig::getperblockretentionpriorityduration (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig36getPerBlockRetentionPriorityDurationE10SizeType3210SizeType32", false]], "tensorrt_llm::executor::kvcacheretentionconfig::gettokenrangeretentionconfigs (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig29getTokenRangeRetentionConfigsEv", false]], "tensorrt_llm::executor::kvcacheretentionconfig::kdefaultretentionpriority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25kDefaultRetentionPriorityE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::kmaxretentionpriority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig21kMaxRetentionPriorityE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::kminretentionpriority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig21kMinRetentionPriorityE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::kvcacheretentionconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE", false], [0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigEv", false]], "tensorrt_llm::executor::kvcacheretentionconfig::mdecodedurationms (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig17mDecodeDurationMsE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::mdecoderetentionpriority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig24mDecodeRetentionPriorityE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::mtokenrangeretentionconfigs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig27mTokenRangeRetentionConfigsE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfigeqERK22KvCacheRetentionConfig", false]], "tensorrt_llm::executor::kvcacheretentionconfig::tokenrangeretentionconfig (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfigE", false]], 
"tensorrt_llm::executor::kvcacheretentionconfig::tokenrangeretentionconfig::durationms (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig10durationMsE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::tokenrangeretentionconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfigeqERK25TokenRangeRetentionConfig", false]], "tensorrt_llm::executor::kvcacheretentionconfig::tokenrangeretentionconfig::priority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig8priorityE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::tokenrangeretentionconfig::tokenend (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig8tokenEndE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::tokenrangeretentionconfig::tokenrangeretentionconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig25TokenRangeRetentionConfigE10SizeType32NSt8optionalI10SizeType32EE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::tokenrangeretentionconfig::tokenstart (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig10tokenStartE", false]], "tensorrt_llm::executor::kvcachestats (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStatsE", false]], "tensorrt_llm::executor::kvcachestats::allocnewblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStats14allocNewBlocksE", false]], "tensorrt_llm::executor::kvcachestats::alloctotalblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStats16allocTotalBlocksE", false]], "tensorrt_llm::executor::kvcachestats::cachehitrate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStats12cacheHitRateE", false]], 
"tensorrt_llm::executor::kvcachestats::freenumblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStats13freeNumBlocksE", false]], "tensorrt_llm::executor::kvcachestats::maxnumblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStats12maxNumBlocksE", false]], "tensorrt_llm::executor::kvcachestats::missedblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStats12missedBlocksE", false]], "tensorrt_llm::executor::kvcachestats::reusedblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStats12reusedBlocksE", false]], "tensorrt_llm::executor::kvcachestats::tokensperblock (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStats14tokensPerBlockE", false]], "tensorrt_llm::executor::kvcachestats::usednumblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStats13usedNumBlocksE", false]], "tensorrt_llm::executor::kvcachestoredblockdata (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockDataE", false]], "tensorrt_llm::executor::kvcachestoredblockdata::blockhash (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData9blockHashE", false]], "tensorrt_llm::executor::kvcachestoredblockdata::cachelevel (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData10cacheLevelE", false]], "tensorrt_llm::executor::kvcachestoredblockdata::kvcachestoredblockdata (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData22KVCacheStoredBlockDataE6IdTypeN12tensorrt_llm7runtime15VecUniqueTokensENSt8optionalIN12tensorrt_llm7runtime14LoraTaskIdTypeEEE10SizeType3210SizeType32", false]], "tensorrt_llm::executor::kvcachestoredblockdata::loraid (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData6loraIdE", false]], "tensorrt_llm::executor::kvcachestoredblockdata::priority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData8priorityE", false]], "tensorrt_llm::executor::kvcachestoredblockdata::tokens 
(c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData6tokensE", false]], "tensorrt_llm::executor::kvcachestoreddata (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor17KVCacheStoredDataE", false]], "tensorrt_llm::executor::kvcachestoreddata::blocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor17KVCacheStoredData6blocksE", false]], "tensorrt_llm::executor::kvcachestoreddata::parenthash (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor17KVCacheStoredData10parentHashE", false]], "tensorrt_llm::executor::kvcacheupdateddata (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedDataE", false]], "tensorrt_llm::executor::kvcacheupdateddata::blockhash (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData9blockHashE", false]], "tensorrt_llm::executor::kvcacheupdateddata::cachelevel (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData10cacheLevelE", false]], "tensorrt_llm::executor::kvcacheupdateddata::cachelevelupdated (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData17cacheLevelUpdatedE10SizeType3210SizeType32", false]], "tensorrt_llm::executor::kvcacheupdateddata::kvcacheupdateddata (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData18KVCacheUpdatedDataE6IdType", false]], "tensorrt_llm::executor::kvcacheupdateddata::priority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData8priorityE", false]], "tensorrt_llm::executor::kvcacheupdateddata::priorityupdated (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData15priorityUpdatedE10SizeType3210SizeType32", false]], "tensorrt_llm::executor::logitspostprocessor (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor19LogitsPostProcessorE", false]], "tensorrt_llm::executor::logitspostprocessorbatched (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor26LogitsPostProcessorBatchedE", false]], "tensorrt_llm::executor::logitspostprocessorconfig (c++ class)": [[0, 
"_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfigE", false]], "tensorrt_llm::executor::logitspostprocessorconfig::getprocessorbatched (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor25LogitsPostProcessorConfig19getProcessorBatchedEv", false]], "tensorrt_llm::executor::logitspostprocessorconfig::getprocessormap (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor25LogitsPostProcessorConfig15getProcessorMapEv", false]], "tensorrt_llm::executor::logitspostprocessorconfig::getreplicate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor25LogitsPostProcessorConfig12getReplicateEv", false]], "tensorrt_llm::executor::logitspostprocessorconfig::logitspostprocessorconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig25LogitsPostProcessorConfigENSt8optionalI22LogitsPostProcessorMapEENSt8optionalI26LogitsPostProcessorBatchedEEb", false]], "tensorrt_llm::executor::logitspostprocessorconfig::mprocessorbatched (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig17mProcessorBatchedE", false]], "tensorrt_llm::executor::logitspostprocessorconfig::mprocessormap (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig13mProcessorMapE", false]], "tensorrt_llm::executor::logitspostprocessorconfig::mreplicate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig10mReplicateE", false]], "tensorrt_llm::executor::logitspostprocessorconfig::setprocessorbatched (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig19setProcessorBatchedERK26LogitsPostProcessorBatched", false]], "tensorrt_llm::executor::logitspostprocessorconfig::setprocessormap (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig15setProcessorMapERK22LogitsPostProcessorMap", false]], "tensorrt_llm::executor::logitspostprocessorconfig::setreplicate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig12setReplicateEb", 
false]], "tensorrt_llm::executor::logitspostprocessormap (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor22LogitsPostProcessorMapE", false]], "tensorrt_llm::executor::lookaheaddecodingconfig (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfigE", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::calculatespeculativeresource (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig28calculateSpeculativeResourceEv", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::calculatespeculativeresourcetuple (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig33calculateSpeculativeResourceTupleE10SizeType3210SizeType3210SizeType32", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::get (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig3getEv", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::getngramsize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig12getNgramSizeEv", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::getverificationsetsize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig22getVerificationSetSizeEv", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::getwindowsize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig13getWindowSizeEv", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::isle (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig4isLEERK23LookaheadDecodingConfig", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::islegal (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig7isLegalE10SizeType3210SizeType3210SizeType32", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::kdefaultlookaheaddecodingngram (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig30kDefaultLookaheadDecodingNgramE", false]], 
"tensorrt_llm::executor::lookaheaddecodingconfig::kdefaultlookaheaddecodingverificationset (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig40kDefaultLookaheadDecodingVerificationSetE", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::kdefaultlookaheaddecodingwindow (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig31kDefaultLookaheadDecodingWindowE", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::lookaheaddecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig23LookaheadDecodingConfigE10SizeType3210SizeType3210SizeType32", false], [0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig23LookaheadDecodingConfigEv", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::mngramsize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig10mNgramSizeE", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::mverificationsetsize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig20mVerificationSetSizeE", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::mwindowsize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig11mWindowSizeE", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfigeqERK23LookaheadDecodingConfig", false]], "tensorrt_llm::executor::loraconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor10LoraConfigE", false]], "tensorrt_llm::executor::loraconfig::getconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor10LoraConfig9getConfigEv", false]], "tensorrt_llm::executor::loraconfig::gettaskid (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor10LoraConfig9getTaskIdEv", false]], "tensorrt_llm::executor::loraconfig::getweights (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor10LoraConfig10getWeightsEv", false]], 
"tensorrt_llm::executor::loraconfig::loraconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor10LoraConfig10LoraConfigE6IdTypeNSt8optionalI6TensorEENSt8optionalI6TensorEE", false]], "tensorrt_llm::executor::loraconfig::mconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10LoraConfig7mConfigE", false]], "tensorrt_llm::executor::loraconfig::mtaskid (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10LoraConfig7mTaskIdE", false]], "tensorrt_llm::executor::loraconfig::mweights (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10LoraConfig8mWeightsE", false]], "tensorrt_llm::executor::medusachoices (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor13MedusaChoicesE", false]], "tensorrt_llm::executor::memorytype (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor10MemoryTypeE", false]], "tensorrt_llm::executor::memorytype::kcpu (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor10MemoryType4kCPUE", false]], "tensorrt_llm::executor::memorytype::kcpu_pinned (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor10MemoryType11kCPU_PINNEDE", false]], "tensorrt_llm::executor::memorytype::kcpu_pinnedpool (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor10MemoryType15kCPU_PINNEDPOOLE", false]], "tensorrt_llm::executor::memorytype::kgpu (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor10MemoryType4kGPUE", false]], "tensorrt_llm::executor::memorytype::kunknown (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor10MemoryType8kUNKNOWNE", false]], "tensorrt_llm::executor::memorytype::kuvm (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor10MemoryType4kUVME", false]], "tensorrt_llm::executor::millisecondstype (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor16MillisecondsTypeE", false]], "tensorrt_llm::executor::modeltype (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor9ModelTypeE", false]], "tensorrt_llm::executor::modeltype::kdecoder_only (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor9ModelType13kDECODER_ONLYE", false]], 
"tensorrt_llm::executor::modeltype::kencoder_decoder (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor9ModelType16kENCODER_DECODERE", false]], "tensorrt_llm::executor::modeltype::kencoder_only (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor9ModelType13kENCODER_ONLYE", false]], "tensorrt_llm::executor::mropeconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor11MropeConfigE", false]], "tensorrt_llm::executor::mropeconfig::getmropepositiondeltas (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11MropeConfig22getMRopePositionDeltasEv", false]], "tensorrt_llm::executor::mropeconfig::getmroperotarycossin (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11MropeConfig20getMRopeRotaryCosSinEv", false]], "tensorrt_llm::executor::mropeconfig::mmropepositiondeltas (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11MropeConfig20mMRopePositionDeltasE", false]], "tensorrt_llm::executor::mropeconfig::mmroperotarycossin (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11MropeConfig18mMRopeRotaryCosSinE", false]], "tensorrt_llm::executor::mropeconfig::mropeconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor11MropeConfig11MropeConfigE6Tensor10SizeType32", false]], "tensorrt_llm::executor::operator<< (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executorlsERNSt7ostreamE21ContextChunkingPolicy", false], [0, "_CPPv4N12tensorrt_llm8executorlsERNSt7ostreamE23CapacitySchedulerPolicy", false]], "tensorrt_llm::executor::orchestratorconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfigE", false]], "tensorrt_llm::executor::orchestratorconfig::getisorchestrator (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18OrchestratorConfig17getIsOrchestratorEv", false]], "tensorrt_llm::executor::orchestratorconfig::getorchleadercomm (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18OrchestratorConfig17getOrchLeaderCommEv", false]], "tensorrt_llm::executor::orchestratorconfig::getspawnprocesses (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor18OrchestratorConfig17getSpawnProcessesEv", false]], "tensorrt_llm::executor::orchestratorconfig::getworkerexecutablepath (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18OrchestratorConfig23getWorkerExecutablePathEv", false]], "tensorrt_llm::executor::orchestratorconfig::misorchestrator (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig15mIsOrchestratorE", false]], "tensorrt_llm::executor::orchestratorconfig::morchleadercomm (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig15mOrchLeaderCommE", false]], "tensorrt_llm::executor::orchestratorconfig::mspawnprocesses (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig15mSpawnProcessesE", false]], "tensorrt_llm::executor::orchestratorconfig::mworkerexecutablepath (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig21mWorkerExecutablePathE", false]], "tensorrt_llm::executor::orchestratorconfig::orchestratorconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig18OrchestratorConfigEbNSt6stringENSt10shared_ptrIN3mpi7MpiCommEEEb", false]], "tensorrt_llm::executor::orchestratorconfig::setisorchestrator (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig17setIsOrchestratorEb", false]], "tensorrt_llm::executor::orchestratorconfig::setorchleadercomm (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig17setOrchLeaderCommERKNSt10shared_ptrIN3mpi7MpiCommEEE", false]], "tensorrt_llm::executor::orchestratorconfig::setspawnprocesses (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig17setSpawnProcessesEb", false]], "tensorrt_llm::executor::orchestratorconfig::setworkerexecutablepath (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig23setWorkerExecutablePathERKNSt6stringE", false]], "tensorrt_llm::executor::outputconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor12OutputConfigE", false]], 
"tensorrt_llm::executor::outputconfig::additionalmodeloutputs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12OutputConfig22additionalModelOutputsE", false]], "tensorrt_llm::executor::outputconfig::excludeinputfromoutput (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12OutputConfig22excludeInputFromOutputE", false]], "tensorrt_llm::executor::outputconfig::outputconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12OutputConfig12OutputConfigEbbbbbbNSt8optionalINSt6vectorI21AdditionalModelOutputEEEE", false]], "tensorrt_llm::executor::outputconfig::returncontextlogits (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12OutputConfig19returnContextLogitsE", false]], "tensorrt_llm::executor::outputconfig::returnencoderoutput (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12OutputConfig19returnEncoderOutputE", false]], "tensorrt_llm::executor::outputconfig::returngenerationlogits (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12OutputConfig22returnGenerationLogitsE", false]], "tensorrt_llm::executor::outputconfig::returnlogprobs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12OutputConfig14returnLogProbsE", false]], "tensorrt_llm::executor::outputconfig::returnperfmetrics (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12OutputConfig17returnPerfMetricsE", false]], "tensorrt_llm::executor::parallelconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfigE", false]], "tensorrt_llm::executor::parallelconfig::getcommunicationmode (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig20getCommunicationModeEv", false]], "tensorrt_llm::executor::parallelconfig::getcommunicationtype (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig20getCommunicationTypeEv", false]], "tensorrt_llm::executor::parallelconfig::getdeviceids (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig12getDeviceIdsEv", false]], "tensorrt_llm::executor::parallelconfig::getnumnodes (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor14ParallelConfig11getNumNodesEv", false]], "tensorrt_llm::executor::parallelconfig::getorchestratorconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig21getOrchestratorConfigEv", false]], "tensorrt_llm::executor::parallelconfig::getparticipantids (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig17getParticipantIdsEv", false]], "tensorrt_llm::executor::parallelconfig::mcommmode (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig9mCommModeE", false]], "tensorrt_llm::executor::parallelconfig::mcommtype (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig9mCommTypeE", false]], "tensorrt_llm::executor::parallelconfig::mdeviceids (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig10mDeviceIdsE", false]], "tensorrt_llm::executor::parallelconfig::mnumnodes (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig9mNumNodesE", false]], "tensorrt_llm::executor::parallelconfig::morchestratorconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig19mOrchestratorConfigE", false]], "tensorrt_llm::executor::parallelconfig::mparticipantids (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig15mParticipantIdsE", false]], "tensorrt_llm::executor::parallelconfig::parallelconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig14ParallelConfigE17CommunicationType17CommunicationModeNSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI18OrchestratorConfigEENSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::parallelconfig::setcommunicationmode (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig20setCommunicationModeE17CommunicationMode", false]], "tensorrt_llm::executor::parallelconfig::setcommunicationtype (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig20setCommunicationTypeE17CommunicationType", false]], 
"tensorrt_llm::executor::parallelconfig::setdeviceids (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig12setDeviceIdsERKNSt6vectorI10SizeType32EE", false]], "tensorrt_llm::executor::parallelconfig::setnumnodes (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig11setNumNodesE10SizeType32", false]], "tensorrt_llm::executor::parallelconfig::setorchestratorconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig21setOrchestratorConfigERK18OrchestratorConfig", false]], "tensorrt_llm::executor::parallelconfig::setparticipantids (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig17setParticipantIdsERKNSt6vectorI10SizeType32EE", false]], "tensorrt_llm::executor::peftcacheconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfigE", false]], "tensorrt_llm::executor::peftcacheconfig::getdevicecachepercent (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig21getDeviceCachePercentEv", false]], "tensorrt_llm::executor::peftcacheconfig::gethostcachesize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig16getHostCacheSizeEv", false]], "tensorrt_llm::executor::peftcacheconfig::getloraprefetchdir (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig18getLoraPrefetchDirEv", false]], "tensorrt_llm::executor::peftcacheconfig::getmaxadaptersize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig17getMaxAdapterSizeEv", false]], "tensorrt_llm::executor::peftcacheconfig::getmaxpagesperblockdevice (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig25getMaxPagesPerBlockDeviceEv", false]], "tensorrt_llm::executor::peftcacheconfig::getmaxpagesperblockhost (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig23getMaxPagesPerBlockHostEv", false]], "tensorrt_llm::executor::peftcacheconfig::getnumcopystreams (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig17getNumCopyStreamsEv", false]], "tensorrt_llm::executor::peftcacheconfig::getnumdevicemodulelayer (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig23getNumDeviceModuleLayerEv", false]], "tensorrt_llm::executor::peftcacheconfig::getnumensureworkers (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig19getNumEnsureWorkersEv", false]], "tensorrt_llm::executor::peftcacheconfig::getnumhostmodulelayer (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig21getNumHostModuleLayerEv", false]], "tensorrt_llm::executor::peftcacheconfig::getnumputworkers (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig16getNumPutWorkersEv", false]], "tensorrt_llm::executor::peftcacheconfig::getoptimaladaptersize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig21getOptimalAdapterSizeEv", false]], "tensorrt_llm::executor::peftcacheconfig::kdefaultmaxadaptersize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig22kDefaultMaxAdapterSizeE", false]], "tensorrt_llm::executor::peftcacheconfig::kdefaultmaxpagesperblockdevice (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig30kDefaultMaxPagesPerBlockDeviceE", false]], "tensorrt_llm::executor::peftcacheconfig::kdefaultmaxpagesperblockhost (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig28kDefaultMaxPagesPerBlockHostE", false]], "tensorrt_llm::executor::peftcacheconfig::kdefaultoptimaladaptersize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig26kDefaultOptimalAdapterSizeE", false]], "tensorrt_llm::executor::peftcacheconfig::mdevicecachepercent (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig19mDeviceCachePercentE", false]], "tensorrt_llm::executor::peftcacheconfig::mhostcachesize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig14mHostCacheSizeE", false]], 
"tensorrt_llm::executor::peftcacheconfig::mloraprefetchdir (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig16mLoraPrefetchDirE", false]], "tensorrt_llm::executor::peftcacheconfig::mmaxadaptersize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15mMaxAdapterSizeE", false]], "tensorrt_llm::executor::peftcacheconfig::mmaxpagesperblockdevice (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig23mMaxPagesPerBlockDeviceE", false]], "tensorrt_llm::executor::peftcacheconfig::mmaxpagesperblockhost (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig21mMaxPagesPerBlockHostE", false]], "tensorrt_llm::executor::peftcacheconfig::mnumcopystreams (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15mNumCopyStreamsE", false]], "tensorrt_llm::executor::peftcacheconfig::mnumdevicemodulelayer (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig21mNumDeviceModuleLayerE", false]], "tensorrt_llm::executor::peftcacheconfig::mnumensureworkers (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig17mNumEnsureWorkersE", false]], "tensorrt_llm::executor::peftcacheconfig::mnumhostmodulelayer (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig19mNumHostModuleLayerE", false]], "tensorrt_llm::executor::peftcacheconfig::mnumputworkers (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig14mNumPutWorkersE", false]], "tensorrt_llm::executor::peftcacheconfig::moptimaladaptersize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig19mOptimalAdapterSizeE", false]], "tensorrt_llm::executor::peftcacheconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfigeqERK15PeftCacheConfig", false]], "tensorrt_llm::executor::peftcacheconfig::peftcacheconfig (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", false]], "tensorrt_llm::executor::prioritytype (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor12PriorityTypeE", false]], "tensorrt_llm::executor::prompttuningconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfigE", false]], "tensorrt_llm::executor::prompttuningconfig::getembeddingtable (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18PromptTuningConfig17getEmbeddingTableEv", false]], "tensorrt_llm::executor::prompttuningconfig::getinputtokenextraids (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18PromptTuningConfig21getInputTokenExtraIdsEv", false]], "tensorrt_llm::executor::prompttuningconfig::membeddingtable (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfig15mEmbeddingTableE", false]], "tensorrt_llm::executor::prompttuningconfig::minputtokenextraids (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfig19mInputTokenExtraIdsE", false]], "tensorrt_llm::executor::prompttuningconfig::prompttuningconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfig18PromptTuningConfigE6TensorNSt8optionalI16VecTokenExtraIdsEE", false]], "tensorrt_llm::executor::randomseedtype (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor14RandomSeedTypeE", false]], "tensorrt_llm::executor::request (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor7RequestE", false]], "tensorrt_llm::executor::request::getadditionaloutputnames (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request24getAdditionalOutputNamesEv", false]], "tensorrt_llm::executor::request::getallottedtimems (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request17getAllottedTimeMsEv", false]], "tensorrt_llm::executor::request::getbadwords (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor7Request11getBadWordsEv", false]], "tensorrt_llm::executor::request::getclientid (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request11getClientIdEv", false]], "tensorrt_llm::executor::request::getcontextphaseparams (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request21getContextPhaseParamsEv", false]], "tensorrt_llm::executor::request::getcrossattentionmask (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request21getCrossAttentionMaskEv", false]], "tensorrt_llm::executor::request::geteagleconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request14getEagleConfigEv", false]], "tensorrt_llm::executor::request::getembeddingbias (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request16getEmbeddingBiasEv", false]], "tensorrt_llm::executor::request::getencoderinputfeatures (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request23getEncoderInputFeaturesEv", false]], "tensorrt_llm::executor::request::getencoderinputtokenids (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request23getEncoderInputTokenIdsEv", false]], "tensorrt_llm::executor::request::getencoderoutputlength (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request22getEncoderOutputLengthEv", false]], "tensorrt_llm::executor::request::getendid (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request8getEndIdEv", false]], "tensorrt_llm::executor::request::getexternaldrafttokensconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request28getExternalDraftTokensConfigEv", false]], "tensorrt_llm::executor::request::getguideddecodingparams (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request23getGuidedDecodingParamsEv", false]], "tensorrt_llm::executor::request::getinputtokenids (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request16getInputTokenIdsEv", false]], "tensorrt_llm::executor::request::getkvcacheretentionconfig (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor7Request25getKvCacheRetentionConfigEv", false]], "tensorrt_llm::executor::request::getlanguageadapteruid (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request21getLanguageAdapterUidEv", false]], "tensorrt_llm::executor::request::getlogitspostprocessor (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request22getLogitsPostProcessorEv", false]], "tensorrt_llm::executor::request::getlogitspostprocessorname (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request26getLogitsPostProcessorNameEv", false]], "tensorrt_llm::executor::request::getlookaheadconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request18getLookaheadConfigEv", false]], "tensorrt_llm::executor::request::getloraconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request13getLoraConfigEv", false]], "tensorrt_llm::executor::request::getmaxtokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request12getMaxTokensEv", false]], "tensorrt_llm::executor::request::getmropeconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request14getMropeConfigEv", false]], "tensorrt_llm::executor::request::getmultimodalembedding (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request22getMultimodalEmbeddingEv", false]], "tensorrt_llm::executor::request::getoutputconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request15getOutputConfigEv", false]], "tensorrt_llm::executor::request::getpadid (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request8getPadIdEv", false]], "tensorrt_llm::executor::request::getpositionids (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request14getPositionIdsEv", false]], "tensorrt_llm::executor::request::getpriority (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request11getPriorityEv", false]], "tensorrt_llm::executor::request::getprompttuningconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request21getPromptTuningConfigEv", false]], 
"tensorrt_llm::executor::request::getrequesttype (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request14getRequestTypeEv", false]], "tensorrt_llm::executor::request::getreturnallgeneratedtokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request27getReturnAllGeneratedTokensEv", false]], "tensorrt_llm::executor::request::getsamplingconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request17getSamplingConfigEv", false]], "tensorrt_llm::executor::request::getskipcrossattnblocks (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request22getSkipCrossAttnBlocksEv", false]], "tensorrt_llm::executor::request::getstopwords (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request12getStopWordsEv", false]], "tensorrt_llm::executor::request::getstreaming (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request12getStreamingEv", false]], "tensorrt_llm::executor::request::kbatchedpostprocessorname (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor7Request25kBatchedPostProcessorNameE", false]], "tensorrt_llm::executor::request::kdefaultpriority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor7Request16kDefaultPriorityE", false]], "tensorrt_llm::executor::request::kdynamicpostprocessornameprefix (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor7Request31kDynamicPostProcessorNamePrefixE", false]], "tensorrt_llm::executor::request::mimpl (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor7Request5mImplE", false]], "tensorrt_llm::executor::request::operator= (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7RequestaSERK7Request", false], [0, "_CPPv4N12tensorrt_llm8executor7RequestaSERR7Request", false]], "tensorrt_llm::executor::request::request (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", false], [0, "_CPPv4N12tensorrt_llm8executor7Request7RequestERK7Request", false], [0, "_CPPv4N12tensorrt_llm8executor7Request7RequestERR7Request", false]], "tensorrt_llm::executor::request::setallottedtimems (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request17setAllottedTimeMsE16MillisecondsType", false]], "tensorrt_llm::executor::request::setbadwords (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request11setBadWordsERKNSt4listI9VecTokensEE", false]], "tensorrt_llm::executor::request::setclientid (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request11setClientIdE6IdType", false]], "tensorrt_llm::executor::request::setcontextphaseparams (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request21setContextPhaseParamsE18ContextPhaseParams", false]], "tensorrt_llm::executor::request::setcrossattentionmask (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request21setCrossAttentionMaskE6Tensor", false]], "tensorrt_llm::executor::request::seteagleconfig (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor7Request14setEagleConfigERKNSt8optionalI11EagleConfigEE", false]], "tensorrt_llm::executor::request::setembeddingbias (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request16setEmbeddingBiasERK6Tensor", false]], "tensorrt_llm::executor::request::setencoderinputfeatures (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request23setEncoderInputFeaturesE6Tensor", false]], "tensorrt_llm::executor::request::setencoderinputtokenids (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request23setEncoderInputTokenIdsERK9VecTokens", false]], "tensorrt_llm::executor::request::setencoderoutputlength (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request22setEncoderOutputLengthE10SizeType32", false]], "tensorrt_llm::executor::request::setendid (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request8setEndIdE10SizeType32", false]], "tensorrt_llm::executor::request::setexternaldrafttokensconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request28setExternalDraftTokensConfigERK25ExternalDraftTokensConfig", false]], "tensorrt_llm::executor::request::setguideddecodingparams (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request23setGuidedDecodingParamsERK20GuidedDecodingParams", false]], "tensorrt_llm::executor::request::setkvcacheretentionconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request25setKvCacheRetentionConfigERK22KvCacheRetentionConfig", false]], "tensorrt_llm::executor::request::setlanguageadapteruid (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request21setLanguageAdapterUidE10SizeType32", false]], "tensorrt_llm::executor::request::setlogitspostprocessor (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request22setLogitsPostProcessorERKNSt8optionalI19LogitsPostProcessorEE", false]], "tensorrt_llm::executor::request::setlogitspostprocessorname (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request26setLogitsPostProcessorNameERKNSt6stringE", false]], 
"tensorrt_llm::executor::request::setlookaheadconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request18setLookaheadConfigERK23LookaheadDecodingConfig", false]], "tensorrt_llm::executor::request::setloraconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request13setLoraConfigERK10LoraConfig", false]], "tensorrt_llm::executor::request::setmropeconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request14setMropeConfigERK11MropeConfig", false]], "tensorrt_llm::executor::request::setmultimodalembedding (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request22setMultimodalEmbeddingERK6Tensor", false]], "tensorrt_llm::executor::request::setoutputconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request15setOutputConfigERK12OutputConfig", false]], "tensorrt_llm::executor::request::setpadid (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request8setPadIdE10SizeType32", false]], "tensorrt_llm::executor::request::setpositionids (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request14setPositionIdsERKNSt6vectorI10SizeType32EE", false]], "tensorrt_llm::executor::request::setpriority (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request11setPriorityE12PriorityType", false]], "tensorrt_llm::executor::request::setprompttuningconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request21setPromptTuningConfigERK18PromptTuningConfig", false]], "tensorrt_llm::executor::request::setrequesttype (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request14setRequestTypeERK11RequestType", false]], "tensorrt_llm::executor::request::setreturnallgeneratedtokens (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request27setReturnAllGeneratedTokensEb", false]], "tensorrt_llm::executor::request::setsamplingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request17setSamplingConfigERK14SamplingConfig", false]], "tensorrt_llm::executor::request::setskipcrossattnblocks (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor7Request22setSkipCrossAttnBlocksE6Tensor", false]], "tensorrt_llm::executor::request::setstopwords (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request12setStopWordsERKNSt4listI9VecTokensEE", false]], "tensorrt_llm::executor::request::setstreaming (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request12setStreamingEb", false]], "tensorrt_llm::executor::request::~request (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7RequestD0Ev", false]], "tensorrt_llm::executor::requestperfmetrics (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetricsE", false]], "tensorrt_llm::executor::requestperfmetrics::firstiter (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics9firstIterE", false]], "tensorrt_llm::executor::requestperfmetrics::iter (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics4iterE", false]], "tensorrt_llm::executor::requestperfmetrics::kvcachemetrics (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14kvCacheMetricsE", false]], "tensorrt_llm::executor::requestperfmetrics::kvcachemetrics (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetricsE", false]], "tensorrt_llm::executor::requestperfmetrics::kvcachemetrics::kvcachehitrate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics14kvCacheHitRateE", false]], "tensorrt_llm::executor::requestperfmetrics::kvcachemetrics::nummissedblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics15numMissedBlocksE", false]], "tensorrt_llm::executor::requestperfmetrics::kvcachemetrics::numnewallocatedblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics21numNewAllocatedBlocksE", false]], "tensorrt_llm::executor::requestperfmetrics::kvcachemetrics::numreusedblocks (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics15numReusedBlocksE", false]], "tensorrt_llm::executor::requestperfmetrics::kvcachemetrics::numtotalallocatedblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics23numTotalAllocatedBlocksE", false]], "tensorrt_llm::executor::requestperfmetrics::lastiter (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics8lastIterE", false]], "tensorrt_llm::executor::requestperfmetrics::speculativedecoding (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics19speculativeDecodingE", false]], "tensorrt_llm::executor::requestperfmetrics::speculativedecodingmetrics (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics26SpeculativeDecodingMetricsE", false]], "tensorrt_llm::executor::requestperfmetrics::speculativedecodingmetrics::acceptancerate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics26SpeculativeDecodingMetrics14acceptanceRateE", false]], "tensorrt_llm::executor::requestperfmetrics::speculativedecodingmetrics::totalaccepteddrafttokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics26SpeculativeDecodingMetrics24totalAcceptedDraftTokensE", false]], "tensorrt_llm::executor::requestperfmetrics::speculativedecodingmetrics::totaldrafttokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics26SpeculativeDecodingMetrics16totalDraftTokensE", false]], "tensorrt_llm::executor::requestperfmetrics::timepoint (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics9TimePointE", false]], "tensorrt_llm::executor::requestperfmetrics::timingmetrics (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13timingMetricsE", false]], "tensorrt_llm::executor::requestperfmetrics::timingmetrics (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetricsE", false]], 
"tensorrt_llm::executor::requestperfmetrics::timingmetrics::arrivaltime (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics11arrivalTimeE", false]], "tensorrt_llm::executor::requestperfmetrics::timingmetrics::firstscheduledtime (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics18firstScheduledTimeE", false]], "tensorrt_llm::executor::requestperfmetrics::timingmetrics::firsttokentime (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics14firstTokenTimeE", false]], "tensorrt_llm::executor::requestperfmetrics::timingmetrics::kvcachesize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics11kvCacheSizeE", false]], "tensorrt_llm::executor::requestperfmetrics::timingmetrics::kvcachetransferend (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics18kvCacheTransferEndE", false]], "tensorrt_llm::executor::requestperfmetrics::timingmetrics::kvcachetransferstart (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics20kvCacheTransferStartE", false]], "tensorrt_llm::executor::requestperfmetrics::timingmetrics::lasttokentime (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics13lastTokenTimeE", false]], "tensorrt_llm::executor::requeststage (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStageE", false]], "tensorrt_llm::executor::requeststage::kcontext_in_progress (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStage20kCONTEXT_IN_PROGRESSE", false]], "tensorrt_llm::executor::requeststage::kencoder_in_progress (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStage20kENCODER_IN_PROGRESSE", false]], "tensorrt_llm::executor::requeststage::kgeneration_complete (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStage20kGENERATION_COMPLETEE", false]], 
"tensorrt_llm::executor::requeststage::kgeneration_in_progress (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStage23kGENERATION_IN_PROGRESSE", false]], "tensorrt_llm::executor::requeststage::kqueued (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStage7kQUEUEDE", false]], "tensorrt_llm::executor::requeststats (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStatsE", false]], "tensorrt_llm::executor::requeststats::allocnewblocksperrequest (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats24allocNewBlocksPerRequestE", false]], "tensorrt_llm::executor::requeststats::alloctotalblocksperrequest (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats26allocTotalBlocksPerRequestE", false]], "tensorrt_llm::executor::requeststats::avgnumdecodedtokensperiter (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats26avgNumDecodedTokensPerIterE", false]], "tensorrt_llm::executor::requeststats::contextprefillposition (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats22contextPrefillPositionE", false]], "tensorrt_llm::executor::requeststats::disservingstats (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats15disServingStatsE", false]], "tensorrt_llm::executor::requeststats::id (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats2idE", false]], "tensorrt_llm::executor::requeststats::kvcachehitrateperrequest (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats24kvCacheHitRatePerRequestE", false]], "tensorrt_llm::executor::requeststats::missedblocksperrequest (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats22missedBlocksPerRequestE", false]], "tensorrt_llm::executor::requeststats::numgeneratedtokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats18numGeneratedTokensE", false]], "tensorrt_llm::executor::requeststats::paused (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats6pausedE", false]], 
"tensorrt_llm::executor::requeststats::reusedblocksperrequest (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats22reusedBlocksPerRequestE", false]], "tensorrt_llm::executor::requeststats::scheduled (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats9scheduledE", false]], "tensorrt_llm::executor::requeststats::stage (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats5stageE", false]], "tensorrt_llm::executor::requeststatsperiteration (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor24RequestStatsPerIterationE", false]], "tensorrt_llm::executor::requeststatsperiteration::iter (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor24RequestStatsPerIteration4iterE", false]], "tensorrt_llm::executor::requeststatsperiteration::requeststats (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor24RequestStatsPerIteration12requestStatsE", false]], "tensorrt_llm::executor::requesttype (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor11RequestTypeE", false]], "tensorrt_llm::executor::requesttype::request_type_context_and_generation (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor11RequestType35REQUEST_TYPE_CONTEXT_AND_GENERATIONE", false]], "tensorrt_llm::executor::requesttype::request_type_context_only (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor11RequestType25REQUEST_TYPE_CONTEXT_ONLYE", false]], "tensorrt_llm::executor::requesttype::request_type_generation_only (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor11RequestType28REQUEST_TYPE_GENERATION_ONLYE", false]], "tensorrt_llm::executor::response (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8ResponseE", false]], "tensorrt_llm::executor::response::getclientid (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8Response11getClientIdEv", false]], "tensorrt_llm::executor::response::geterrormsg (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8Response11getErrorMsgEv", false]], "tensorrt_llm::executor::response::getrequestid (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor8Response12getRequestIdEv", false]], "tensorrt_llm::executor::response::getresult (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8Response9getResultEv", false]], "tensorrt_llm::executor::response::haserror (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8Response8hasErrorEv", false]], "tensorrt_llm::executor::response::mimpl (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8Response5mImplE", false]], "tensorrt_llm::executor::response::operator= (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8ResponseaSERK8Response", false], [0, "_CPPv4N12tensorrt_llm8executor8ResponseaSERR8Response", false]], "tensorrt_llm::executor::response::response (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdType6ResultNSt8optionalI6IdTypeEE", false], [0, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdTypeNSt6stringENSt8optionalI6IdTypeEE", false], [0, "_CPPv4N12tensorrt_llm8executor8Response8ResponseERK8Response", false], [0, "_CPPv4N12tensorrt_llm8executor8Response8ResponseERR8Response", false]], "tensorrt_llm::executor::response::~response (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8ResponseD0Ev", false]], "tensorrt_llm::executor::result (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor6ResultE", false]], "tensorrt_llm::executor::result::additionaloutputs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result17additionalOutputsE", false]], "tensorrt_llm::executor::result::contextlogits (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result13contextLogitsE", false]], "tensorrt_llm::executor::result::contextphaseparams (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result18contextPhaseParamsE", false]], "tensorrt_llm::executor::result::cumlogprobs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result11cumLogProbsE", false]], "tensorrt_llm::executor::result::decodingiter (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result12decodingIterE", false]], 
"tensorrt_llm::executor::result::encoderoutput (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result13encoderOutputE", false]], "tensorrt_llm::executor::result::finishreasons (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result13finishReasonsE", false]], "tensorrt_llm::executor::result::generationlogits (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result16generationLogitsE", false]], "tensorrt_llm::executor::result::isfinal (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result7isFinalE", false]], "tensorrt_llm::executor::result::issequencefinal (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result15isSequenceFinalE", false]], "tensorrt_llm::executor::result::logprobs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result8logProbsE", false]], "tensorrt_llm::executor::result::outputtokenids (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result14outputTokenIdsE", false]], "tensorrt_llm::executor::result::requestperfmetrics (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result18requestPerfMetricsE", false]], "tensorrt_llm::executor::result::sequenceindex (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result13sequenceIndexE", false]], "tensorrt_llm::executor::result::specdecfastlogitsinfo (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result21specDecFastLogitsInfoE", false]], "tensorrt_llm::executor::retentionpriority (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor17RetentionPriorityE", false]], "tensorrt_llm::executor::retentionpriorityandduration (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDurationE", false]], "tensorrt_llm::executor::retentionpriorityandduration::durationms (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDuration10durationMsE", false]], "tensorrt_llm::executor::retentionpriorityandduration::retentionpriority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDuration17retentionPriorityE", false]], 
"tensorrt_llm::executor::retentionpriorityandduration::retentionpriorityandduration (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDuration28RetentionPriorityAndDurationERKNSt8optionalI17RetentionPriorityEERKNSt8optionalINSt6chrono12millisecondsEEE", false]], "tensorrt_llm::executor::samplingconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfigE", false]], "tensorrt_llm::executor::samplingconfig::checkbeamsearchdiversityrate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig28checkBeamSearchDiversityRateERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::checkbeamwidth (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14checkBeamWidthE10SizeType32", false]], "tensorrt_llm::executor::samplingconfig::checkbeamwidtharray (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig19checkBeamWidthArrayERKNSt8optionalINSt6vectorI10SizeType32EEEEK10SizeType32", false]], "tensorrt_llm::executor::samplingconfig::checkearlystopping (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18checkEarlyStoppingERKNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::samplingconfig::checklengthpenalty (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18checkLengthPenaltyERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::checkminp (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig9checkMinPERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::checkmintokens (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14checkMinTokensERKNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::samplingconfig::checknorepeatngramsize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig22checkNoRepeatNgramSizeERKNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::samplingconfig::checknumreturnsequences 
(c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig23checkNumReturnSequencesERKNSt8optionalI10SizeType32EE10SizeType32", false]], "tensorrt_llm::executor::samplingconfig::checkrepetitionpenalty (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig22checkRepetitionPenaltyERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::checktemperature (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16checkTemperatureERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::checktopk (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig9checkTopKERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::checktopp (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig9checkTopPERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::checktoppdecay (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14checkTopPDecayERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::checktoppmin (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12checkTopPMinERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::checktoppresetids (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig17checkTopPResetIdsERKNSt8optionalI11TokenIdTypeEE", false]], "tensorrt_llm::executor::samplingconfig::getbeamsearchdiversityrate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig26getBeamSearchDiversityRateEv", false]], "tensorrt_llm::executor::samplingconfig::getbeamwidth (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig12getBeamWidthEv", false]], "tensorrt_llm::executor::samplingconfig::getbeamwidtharray (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig17getBeamWidthArrayEv", false]], "tensorrt_llm::executor::samplingconfig::getearlystopping (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor14SamplingConfig16getEarlyStoppingEv", false]], "tensorrt_llm::executor::samplingconfig::getfrequencypenalty (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig19getFrequencyPenaltyEv", false]], "tensorrt_llm::executor::samplingconfig::getlengthpenalty (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig16getLengthPenaltyEv", false]], "tensorrt_llm::executor::samplingconfig::getminp (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig7getMinPEv", false]], "tensorrt_llm::executor::samplingconfig::getmintokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig12getMinTokensEv", false]], "tensorrt_llm::executor::samplingconfig::getnorepeatngramsize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig20getNoRepeatNgramSizeEv", false]], "tensorrt_llm::executor::samplingconfig::getnumreturnbeams (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig17getNumReturnBeamsEv", false]], "tensorrt_llm::executor::samplingconfig::getnumreturnsequences (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig21getNumReturnSequencesEv", false]], "tensorrt_llm::executor::samplingconfig::getpresencepenalty (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig18getPresencePenaltyEv", false]], "tensorrt_llm::executor::samplingconfig::getrepetitionpenalty (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig20getRepetitionPenaltyEv", false]], "tensorrt_llm::executor::samplingconfig::getseed (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig7getSeedEv", false]], "tensorrt_llm::executor::samplingconfig::gettemperature (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig14getTemperatureEv", false]], "tensorrt_llm::executor::samplingconfig::gettopk (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig7getTopKEv", false]], 
"tensorrt_llm::executor::samplingconfig::gettopp (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig7getTopPEv", false]], "tensorrt_llm::executor::samplingconfig::gettoppdecay (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig12getTopPDecayEv", false]], "tensorrt_llm::executor::samplingconfig::gettoppmin (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig10getTopPMinEv", false]], "tensorrt_llm::executor::samplingconfig::gettoppresetids (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig15getTopPResetIdsEv", false]], "tensorrt_llm::executor::samplingconfig::mbeamsearchdiversityrate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig24mBeamSearchDiversityRateE", false]], "tensorrt_llm::executor::samplingconfig::mbeamwidth (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig10mBeamWidthE", false]], "tensorrt_llm::executor::samplingconfig::mbeamwidtharray (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig15mBeamWidthArrayE", false]], "tensorrt_llm::executor::samplingconfig::mearlystopping (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14mEarlyStoppingE", false]], "tensorrt_llm::executor::samplingconfig::mfrequencypenalty (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig17mFrequencyPenaltyE", false]], "tensorrt_llm::executor::samplingconfig::mlengthpenalty (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14mLengthPenaltyE", false]], "tensorrt_llm::executor::samplingconfig::mminp (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig5mMinPE", false]], "tensorrt_llm::executor::samplingconfig::mmintokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig10mMinTokensE", false]], "tensorrt_llm::executor::samplingconfig::mnorepeatngramsize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18mNoRepeatNgramSizeE", false]], 
"tensorrt_llm::executor::samplingconfig::mnumreturnbeams (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig15mNumReturnBeamsE", false]], "tensorrt_llm::executor::samplingconfig::mnumreturnsequences (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig19mNumReturnSequencesE", false]], "tensorrt_llm::executor::samplingconfig::mpresencepenalty (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16mPresencePenaltyE", false]], "tensorrt_llm::executor::samplingconfig::mrepetitionpenalty (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18mRepetitionPenaltyE", false]], "tensorrt_llm::executor::samplingconfig::mseed (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig5mSeedE", false]], "tensorrt_llm::executor::samplingconfig::mtemperature (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12mTemperatureE", false]], "tensorrt_llm::executor::samplingconfig::mtopk (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig5mTopKE", false]], "tensorrt_llm::executor::samplingconfig::mtopp (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig5mTopPE", false]], "tensorrt_llm::executor::samplingconfig::mtoppdecay (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig10mTopPDecayE", false]], "tensorrt_llm::executor::samplingconfig::mtoppmin (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig8mTopPMinE", false]], "tensorrt_llm::executor::samplingconfig::mtoppresetids (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig13mTopPResetIdsE", false]], "tensorrt_llm::executor::samplingconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfigeqERK14SamplingConfig", false]], "tensorrt_llm::executor::samplingconfig::samplingconfig (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", false]], "tensorrt_llm::executor::samplingconfig::setbeamsearchdiversityrate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig26setBeamSearchDiversityRateERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::setbeamwidth (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12setBeamWidthE10SizeType32", false]], "tensorrt_llm::executor::samplingconfig::setbeamwidtharray (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig17setBeamWidthArrayERKNSt8optionalINSt6vectorI10SizeType32EEEE", false]], "tensorrt_llm::executor::samplingconfig::setearlystopping (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16setEarlyStoppingERKNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::samplingconfig::setfrequencypenalty (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig19setFrequencyPenaltyERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::setlengthpenalty (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16setLengthPenaltyERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::setminp (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setMinPERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::setmintokens (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig12setMinTokensERKNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::samplingconfig::setnorepeatngramsize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig20setNoRepeatNgramSizeERKNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::samplingconfig::setnumreturnsequences (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig21setNumReturnSequencesERKNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::samplingconfig::setpresencepenalty (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18setPresencePenaltyERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::setrepetitionpenalty (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig20setRepetitionPenaltyERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::setseed (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setSeedERKNSt8optionalI14RandomSeedTypeEE", false]], "tensorrt_llm::executor::samplingconfig::settemperature (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14setTemperatureERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::settopk (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setTopKERKNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::samplingconfig::settopp (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setTopPERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::settoppdecay (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12setTopPDecayERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::settoppmin (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig10setTopPMinERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::settoppresetids (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig15setTopPResetIdsERKNSt8optionalI11TokenIdTypeEE", false]], "tensorrt_llm::executor::samplingconfig::updatenumreturnbeams (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig20updateNumReturnBeamsEv", false]], "tensorrt_llm::executor::schedulerconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor15SchedulerConfigE", false]], "tensorrt_llm::executor::schedulerconfig::getcapacityschedulerpolicy (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15SchedulerConfig26getCapacitySchedulerPolicyEv", false]], "tensorrt_llm::executor::schedulerconfig::getcontextchunkingpolicy (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15SchedulerConfig24getContextChunkingPolicyEv", false]], "tensorrt_llm::executor::schedulerconfig::getdynamicbatchconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15SchedulerConfig21getDynamicBatchConfigEv", false]], "tensorrt_llm::executor::schedulerconfig::mcapacityschedulerpolicy (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig24mCapacitySchedulerPolicyE", false]], "tensorrt_llm::executor::schedulerconfig::mcontextchunkingpolicy (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig22mContextChunkingPolicyE", false]], "tensorrt_llm::executor::schedulerconfig::mdynamicbatchconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig19mDynamicBatchConfigE", false]], "tensorrt_llm::executor::schedulerconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15SchedulerConfigeqERK15SchedulerConfig", false]], "tensorrt_llm::executor::schedulerconfig::schedulerconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig15SchedulerConfigE23CapacitySchedulerPolicyNSt8optionalI21ContextChunkingPolicyEENSt8optionalI18DynamicBatchConfigEE", false]], "tensorrt_llm::executor::serialization (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor13SerializationE", false]], 
"tensorrt_llm::executor::serialization::deserializeadditionalmodeloutput (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization32deserializeAdditionalModelOutputERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeadditionaloutput (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization27deserializeAdditionalOutputERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializebool (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization15deserializeBoolERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializecachestate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization21deserializeCacheStateERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializecachetransceiverconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization33deserializeCacheTransceiverConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializecommstate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeCommStateERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializecontextphaseparams (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeContextPhaseParamsERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializedatatransceiverstate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeDataTransceiverStateERNSt6vectorIcEE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeDataTransceiverStateERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializedebugconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeDebugConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializedecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeDecodingConfigERNSt7istreamE", false]], 
"tensorrt_llm::executor::serialization::deserializedecodingmode (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeDecodingModeERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializedisservingrequeststats (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization33deserializeDisServingRequestStatsERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializedynamicbatchconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeDynamicBatchConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeeagleconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeEagleConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeexecutorconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeExecutorConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeextendedruntimeperfknobconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization40deserializeExtendedRuntimePerfKnobConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeexternaldrafttokensconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization36deserializeExternalDraftTokensConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeguideddecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeGuidedDecodingConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeguideddecodingparams (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeGuidedDecodingParamsERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeinflightbatchingstats (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization32deserializeInflightBatchingStatsERNSt7istreamE", false]], 
"tensorrt_llm::executor::serialization::deserializeiterationstats (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeIterationStatsERNSt6vectorIcEE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeIterationStatsERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeiterationstatsvec (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization28deserializeIterationStatsVecERNSt6vectorIcEE", false]], "tensorrt_llm::executor::serialization::deserializekvcacheconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization24deserializeKvCacheConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializekvcacheretentionconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization33deserializeKvCacheRetentionConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializekvcachestats (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeKvCacheStatsERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializelookaheaddecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization34deserializeLookaheadDecodingConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeloraconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization21deserializeLoraConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializemodeltype (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeModelTypeERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializemropeconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeMropeConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeorchestratorconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeOrchestratorConfigERNSt7istreamE", false]], 
"tensorrt_llm::executor::serialization::deserializeoutputconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeOutputConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeparallelconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeParallelConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializepeftcacheconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization26deserializePeftCacheConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeprompttuningconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializePromptTuningConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializerequest (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization18deserializeRequestERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializerequestperfmetrics (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeRequestPerfMetricsERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializerequeststage (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeRequestStageERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializerequeststats (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeRequestStatsERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializerequeststatsperiteration (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization35deserializeRequestStatsPerIterationERNSt6vectorIcEE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization35deserializeRequestStatsPerIterationERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializerequeststatsperiterationvec (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor13Serialization38deserializeRequestStatsPerIterationVecERNSt6vectorIcEE", false]], "tensorrt_llm::executor::serialization::deserializeresponse (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization19deserializeResponseERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeresponses (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeResponsesERNSt6vectorIcEE", false]], "tensorrt_llm::executor::serialization::deserializeresult (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization17deserializeResultERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializesamplingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeSamplingConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeschedulerconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization26deserializeSchedulerConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializesocketstate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeSocketStateERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializespecdecfastlogitsinfo (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization32deserializeSpecDecFastLogitsInfoERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializespeculativedecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization36deserializeSpeculativeDecodingConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializestaticbatchingstats (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization30deserializeStaticBatchingStatsERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializestring (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization17deserializeStringERNSt7istreamE", false]], 
"tensorrt_llm::executor::serialization::deserializetensor (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization17deserializeTensorERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializetimepoint (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeTimePointERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializetokenrangeretentionconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization36deserializeTokenRangeRetentionConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::serialize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK10LoraConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11DebugConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11EagleConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11MropeConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12DecodingModeRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12KvCacheStatsRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12OutputConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStageRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStatsRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK13KvCacheConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14DecodingConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ExecutorConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStats", false], [0, 
"_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStatsRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ParallelConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14SamplingConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15PeftCacheConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15SchedulerConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK16AdditionalOutputRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18ContextPhaseParamsRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18DynamicBatchConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18OrchestratorConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18PromptTuningConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18RequestPerfMetricsRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK19StaticBatchingStatsRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20DataTransceiverState", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20DataTransceiverStateRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingParamsRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21AdditionalModelOutputRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21InflightBatchingStatsRNSt7ostreamE", false], [0, 
"_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22CacheTransceiverConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22DisServingRequestStatsRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22KvCacheRetentionConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK23LookaheadDecodingConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK24RequestStatsPerIteration", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK24RequestStatsPerIterationRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25ExternalDraftTokensConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25SpeculativeDecodingConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK29ExtendedRuntimePerfKnobConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK33SpeculativeDecodingFastLogitsInfoRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6ResultRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6TensorRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK7RequestRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK8ResponseRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN18RequestPerfMetrics9TimePointERNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN22KvCacheRetentionConfig25TokenRangeRetentionConfigERNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10CacheStateERNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache11SocketStateERNSt7ostreamE", false], [0, 
"_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache9CommStateERNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI14IterationStatsEE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI24RequestStatsPerIterationEE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI8ResponseEE", false]], "tensorrt_llm::executor::serialization::serializedsize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK10LoraConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11DebugConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11EagleConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11MropeConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12DecodingMode", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12KvCacheStats", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12OutputConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12RequestStage", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12RequestStats", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK13KvCacheConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14DecodingConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14ExecutorConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14IterationStats", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14ParallelConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14SamplingConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK15PeftCacheConfig", 
false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK15SchedulerConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK16AdditionalOutput", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18ContextPhaseParams", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18DynamicBatchConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18OrchestratorConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18PromptTuningConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18RequestPerfMetrics", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK19StaticBatchingStats", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20DataTransceiverState", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20GuidedDecodingConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20GuidedDecodingParams", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK21AdditionalModelOutput", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK21InflightBatchingStats", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22CacheTransceiverConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22DisServingRequestStats", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22KvCacheRetentionConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK23LookaheadDecodingConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK24RequestStatsPerIteration", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK25ExternalDraftTokensConfig", false], [0, 
"_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK25SpeculativeDecodingConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK29ExtendedRuntimePerfKnobConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK33SpeculativeDecodingFastLogitsInfo", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK6Result", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK6Tensor", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK7Request", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK8Response", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN18RequestPerfMetrics9TimePointE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN22KvCacheRetentionConfig25TokenRangeRetentionConfigE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache10CacheStateE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache11SocketStateE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache9CommStateE", false]], "tensorrt_llm::executor::shape (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor5ShapeE", false]], "tensorrt_llm::executor::shape::base (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor5Shape4BaseE", false]], "tensorrt_llm::executor::shape::dimtype64 (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor5Shape9DimType64E", false]], "tensorrt_llm::executor::shape::shape (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor5Shape5ShapeENSt16initializer_listI9DimType64EE", false], [0, "_CPPv4N12tensorrt_llm8executor5Shape5ShapeEPK9DimType64N4Base9size_typeE", false], [0, "_CPPv4N12tensorrt_llm8executor5Shape5ShapeEv", false]], "tensorrt_llm::executor::sizetype32 (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor10SizeType32E", false]], 
"tensorrt_llm::executor::speculativedecodingconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor25SpeculativeDecodingConfigE", false]], "tensorrt_llm::executor::speculativedecodingconfig::fastlogits (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor25SpeculativeDecodingConfig10fastLogitsE", false]], "tensorrt_llm::executor::speculativedecodingconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor25SpeculativeDecodingConfigeqERK25SpeculativeDecodingConfig", false]], "tensorrt_llm::executor::speculativedecodingconfig::speculativedecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor25SpeculativeDecodingConfig25SpeculativeDecodingConfigEb", false]], "tensorrt_llm::executor::speculativedecodingfastlogitsinfo (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor33SpeculativeDecodingFastLogitsInfoE", false]], "tensorrt_llm::executor::speculativedecodingfastlogitsinfo::draftparticipantid (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor33SpeculativeDecodingFastLogitsInfo18draftParticipantIdE", false]], "tensorrt_llm::executor::speculativedecodingfastlogitsinfo::draftrequestid (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor33SpeculativeDecodingFastLogitsInfo14draftRequestIdE", false]], "tensorrt_llm::executor::speculativedecodingfastlogitsinfo::totensor (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor33SpeculativeDecodingFastLogitsInfo8toTensorEv", false]], "tensorrt_llm::executor::staticbatchingstats (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStatsE", false]], "tensorrt_llm::executor::staticbatchingstats::emptygenslots (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStats13emptyGenSlotsE", false]], "tensorrt_llm::executor::staticbatchingstats::numcontextrequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStats18numContextRequestsE", false]], "tensorrt_llm::executor::staticbatchingstats::numctxtokens (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor19StaticBatchingStats12numCtxTokensE", false]], "tensorrt_llm::executor::staticbatchingstats::numgentokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStats12numGenTokensE", false]], "tensorrt_llm::executor::staticbatchingstats::numscheduledrequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStats20numScheduledRequestsE", false]], "tensorrt_llm::executor::streamptr (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor9StreamPtrE", false]], "tensorrt_llm::executor::tensor (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor6TensorE", false]], "tensorrt_llm::executor::tensor::copyto (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor6copyToENSt10shared_ptrI4ImplEE13CudaStreamPtr", false]], "tensorrt_llm::executor::tensor::copytocpu (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor9copyToCpuEN6Tensor13CudaStreamPtrE", false]], "tensorrt_llm::executor::tensor::copytogpu (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor9copyToGpuEN6Tensor13CudaStreamPtrE", false]], "tensorrt_llm::executor::tensor::copytomanaged (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor13copyToManagedEN6Tensor13CudaStreamPtrE", false]], "tensorrt_llm::executor::tensor::copytopinned (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor12copyToPinnedEN6Tensor13CudaStreamPtrE", false]], "tensorrt_llm::executor::tensor::copytopooledpinned (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor18copyToPooledPinnedEN6Tensor13CudaStreamPtrE", false]], "tensorrt_llm::executor::tensor::cpu (c++ function)": [[0, "_CPPv4I0EN12tensorrt_llm8executor6Tensor3cpuE6Tensor5Shape", false], [0, "_CPPv4N12tensorrt_llm8executor6Tensor3cpuE8DataType5Shape", false]], "tensorrt_llm::executor::tensor::cudastreamptr (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor6Tensor13CudaStreamPtrE", false]], "tensorrt_llm::executor::tensor::detail::ofitensor (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor6Tensor6detail9ofITensorENSt10shared_ptrIN7runtime7ITensorEEE", false]], "tensorrt_llm::executor::tensor::detail::toitensor (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor6Tensor6detail9toITensorERK6Tensor", false]], "tensorrt_llm::executor::tensor::getdata (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor6Tensor7getDataEv", false], [0, "_CPPv4NK12tensorrt_llm8executor6Tensor7getDataEv", false]], "tensorrt_llm::executor::tensor::getdatatype (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor11getDataTypeEv", false]], "tensorrt_llm::executor::tensor::getmemorytype (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor13getMemoryTypeEv", false]], "tensorrt_llm::executor::tensor::getruntimetype (c++ function)": [[0, "_CPPv4I0EN12tensorrt_llm8executor6Tensor14getRuntimeTypeE8DataTypev", false]], "tensorrt_llm::executor::tensor::getshape (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor8getShapeEv", false]], "tensorrt_llm::executor::tensor::getsize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor7getSizeEv", false]], "tensorrt_llm::executor::tensor::getsizeinbytes (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor14getSizeInBytesEv", false]], "tensorrt_llm::executor::tensor::gpu (c++ function)": [[0, "_CPPv4I0EN12tensorrt_llm8executor6Tensor3gpuE6Tensor13CudaStreamPtr5Shape", false], [0, "_CPPv4N12tensorrt_llm8executor6Tensor3gpuE8DataType13CudaStreamPtr5Shape", false]], "tensorrt_llm::executor::tensor::impl (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor6Tensor4ImplE", false]], "tensorrt_llm::executor::tensor::managed (c++ function)": [[0, "_CPPv4I0EN12tensorrt_llm8executor6Tensor7managedE6Tensor5Shape", false], [0, "_CPPv4N12tensorrt_llm8executor6Tensor7managedE8DataType5Shape", false]], "tensorrt_llm::executor::tensor::mtensor (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Tensor7mTensorE", false]], "tensorrt_llm::executor::tensor::of (c++ function)": [[0, 
"_CPPv4I0EN12tensorrt_llm8executor6Tensor2ofE6TensorP1T5Shape", false], [0, "_CPPv4I0EN12tensorrt_llm8executor6Tensor2ofE6TensorR1T", false], [0, "_CPPv4N12tensorrt_llm8executor6Tensor2ofE8DataTypePv5Shape", false]], "tensorrt_llm::executor::tensor::operator bool (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6TensorcvbEv", false]], "tensorrt_llm::executor::tensor::operator!= (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6TensorneERK6Tensor", false]], "tensorrt_llm::executor::tensor::operator= (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor6TensoraSERK6Tensor", false], [0, "_CPPv4N12tensorrt_llm8executor6TensoraSERR6Tensor", false]], "tensorrt_llm::executor::tensor::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6TensoreqERK6Tensor", false]], "tensorrt_llm::executor::tensor::pinned (c++ function)": [[0, "_CPPv4I0EN12tensorrt_llm8executor6Tensor6pinnedE6Tensor5Shape", false], [0, "_CPPv4N12tensorrt_llm8executor6Tensor6pinnedE8DataType5Shape", false]], "tensorrt_llm::executor::tensor::pooledpinned (c++ function)": [[0, "_CPPv4I0EN12tensorrt_llm8executor6Tensor12pooledPinnedE6Tensor5Shape", false], [0, "_CPPv4N12tensorrt_llm8executor6Tensor12pooledPinnedE8DataType5Shape", false]], "tensorrt_llm::executor::tensor::setfrom (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor6Tensor7setFromERK6Tensor13CudaStreamPtr", false]], "tensorrt_llm::executor::tensor::setzero (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor6Tensor7setZeroE13CudaStreamPtr", false]], "tensorrt_llm::executor::tensor::tensor (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorENSt10shared_ptrIN7runtime7ITensorEEE", false], [0, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorERK6Tensor", false], [0, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorERR6Tensor", false], [0, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorEv", false]], "tensorrt_llm::executor::tensor::~tensor (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor6TensorD0Ev", false]], 
"tensorrt_llm::executor::tensorptr (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor9TensorPtrE", false]], "tensorrt_llm::executor::tokenidtype (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor11TokenIdTypeE", false]], "tensorrt_llm::executor::typetraits (c++ struct)": [[0, "_CPPv4I0_bEN12tensorrt_llm8executor10TypeTraitsE", false]], "tensorrt_llm::executor::typetraits<bool> (c++ struct)": [[0, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsIbEE", false]], "tensorrt_llm::executor::typetraits<bool>::value (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10TypeTraitsIbE5valueE", false]], "tensorrt_llm::executor::typetraits<float> (c++ struct)": [[0, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsIfEE", false]], "tensorrt_llm::executor::typetraits<float>::value (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10TypeTraitsIfE5valueE", false]], "tensorrt_llm::executor::typetraits<half> (c++ struct)": [[0, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsI4halfEE", false]], "tensorrt_llm::executor::typetraits<half>::value (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10TypeTraitsI4halfE5valueE", false]], "tensorrt_llm::executor::typetraits<std::int32_t> (c++ struct)": [[0, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsINSt7int32_tEEE", false]], "tensorrt_llm::executor::typetraits<std::int32_t>::value (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10TypeTraitsINSt7int32_tEE5valueE", false]], "tensorrt_llm::executor::typetraits<std::int64_t> (c++ struct)": [[0, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsINSt7int64_tEEE", false]], "tensorrt_llm::executor::typetraits<std::int64_t>::value (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10TypeTraitsINSt7int64_tEE5valueE", false]], "tensorrt_llm::executor::typetraits<std::int8_t> (c++ struct)": [[0, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsINSt6int8_tEEE", false]], "tensorrt_llm::executor::typetraits<std::int8_t>::value (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10TypeTraitsINSt6int8_tEE5valueE", false]], 
"tensorrt_llm::executor::typetraits<std::uint8_t> (c++ struct)": [[0, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsINSt7uint8_tEEE", false]], "tensorrt_llm::executor::typetraits<std::uint8_t>::value (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10TypeTraitsINSt7uint8_tEE5valueE", false]], "tensorrt_llm::executor::typetraits<t*> (c++ struct)": [[0, "_CPPv4I0EN12tensorrt_llm8executor10TypeTraitsIP1TEE", false]], "tensorrt_llm::executor::typetraits<t*>::value (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10TypeTraitsIP1TE5valueE", false]], "tensorrt_llm::executor::veclogprobs (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor11VecLogProbsE", false]], "tensorrt_llm::executor::vectokenextraids (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor16VecTokenExtraIdsE", false]], "tensorrt_llm::executor::vectokens (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor9VecTokensE", false]], "tensorrt_llm::executor::version (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7versionEv", false]], "tensorrt_llm::layers (c++ type)": [[1, "_CPPv4N12tensorrt_llm6layersE", false]], "tensorrt_llm::mpi (c++ type)": [[0, "_CPPv4N12tensorrt_llm3mpiE", false]], "tensorrt_llm::runtime (c++ type)": [[0, "_CPPv4N12tensorrt_llm7runtimeE", false], [1, "_CPPv4N12tensorrt_llm7runtimeE", false]], "tensorrt_llm::runtime::allreducebuffers (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffersE", false]], "tensorrt_llm::runtime::allreducebuffers::allreducebuffers (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb", false]], "tensorrt_llm::runtime::allreducebuffers::mallreducecommptrs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers18mAllReduceCommPtrsE", false]], "tensorrt_llm::runtime::allreducebuffers::mipcmemoryhandles (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers17mIpcMemoryHandlesE", false]], 
"tensorrt_llm::runtime::allreducebuffers::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9TensorPtrE", false]], "tensorrt_llm::runtime::buffercast (c++ function)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime10bufferCastEP1TR7IBuffer", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime10bufferCastEPK1TRK7IBuffer", false]], "tensorrt_llm::runtime::buffercastornull (c++ function)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKN7IBuffer9SharedPtrE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKN7ITensor9SharedPtrE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKNSt8optionalIN7IBuffer9SharedPtrEEE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKNSt8optionalIN7ITensor9SharedPtrEEE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKN7IBuffer14SharedConstPtrE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKN7ITensor14SharedConstPtrE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKNSt8optionalIN7IBuffer14SharedConstPtrEEE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKNSt8optionalIN7ITensor14SharedConstPtrEEE", false]], "tensorrt_llm::runtime::bufferdatatype (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime14BufferDataTypeE", false]], "tensorrt_llm::runtime::bufferdatatype::bufferdatatype (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType14BufferDataTypeEN8nvinfer18DataTypeEbb", false]], "tensorrt_llm::runtime::bufferdatatype::getdatatype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType11getDataTypeEv", false]], "tensorrt_llm::runtime::bufferdatatype::getsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType7getSizeEv", false]], "tensorrt_llm::runtime::bufferdatatype::getsizeinbits (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType13getSizeInBitsEv", false]], 
"tensorrt_llm::runtime::bufferdatatype::ispointer (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType9isPointerEv", false]], "tensorrt_llm::runtime::bufferdatatype::isunsigned (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType10isUnsignedEv", false]], "tensorrt_llm::runtime::bufferdatatype::ktrtpointertype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType15kTrtPointerTypeE", false]], "tensorrt_llm::runtime::bufferdatatype::mdatatype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType9mDataTypeE", false]], "tensorrt_llm::runtime::bufferdatatype::mpointer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType8mPointerE", false]], "tensorrt_llm::runtime::bufferdatatype::munsigned (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType9mUnsignedE", false]], "tensorrt_llm::runtime::bufferdatatype::operator nvinfer1::datatype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataTypecvN8nvinfer18DataTypeEEv", false]], "tensorrt_llm::runtime::buffermanager (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManagerE", false]], "tensorrt_llm::runtime::buffermanager::allocate (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeN8nvinfer14DimsEN8nvinfer18DataTypeE", false], [1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeNSt6size_tEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::buffermanager (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager13BufferManagerE13CudaStreamPtrb", false]], "tensorrt_llm::runtime::buffermanager::copy (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyEPKvR7IBuffer", false], [1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyEPKvR7IBuffer10MemoryType", false], [1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferPv", false], [1, 
"_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferPv10MemoryType", false], [1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferR7IBuffer", false]], "tensorrt_llm::runtime::buffermanager::copyfrom (c++ function)": [[1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10IBufferPtrRKNSt6vectorI1TEE10MemoryType", false], [1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrP1TN8nvinfer14DimsE10MemoryType", false], [1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrRKNSt6vectorI1TEEN8nvinfer14DimsE10MemoryType", false], [1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8copyFromERK7IBuffer10MemoryType", false], [1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8copyFromERK7ITensor10MemoryType", false]], "tensorrt_llm::runtime::buffermanager::cpu (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager3cpuEN8nvinfer14DimsEN8nvinfer18DataTypeE", false], [1, "_CPPv4N12tensorrt_llm7runtime13BufferManager3cpuENSt6size_tEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::cudamempoolptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager14CudaMemPoolPtrE", false]], "tensorrt_llm::runtime::buffermanager::cudastreamptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager13CudaStreamPtrE", false]], "tensorrt_llm::runtime::buffermanager::emptybuffer (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager11emptyBufferE10MemoryTypeN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::emptytensor (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager11emptyTensorE10MemoryTypeN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::getstream (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager9getStreamEv", false]], "tensorrt_llm::runtime::buffermanager::gpu (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager3gpuEN8nvinfer14DimsEN8nvinfer18DataTypeE", false], 
[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager3gpuENSt6size_tEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::gpusync (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7gpuSyncEN8nvinfer14DimsEN8nvinfer18DataTypeE", false], [1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7gpuSyncENSt6size_tEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::ibufferptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10IBufferPtrE", false]], "tensorrt_llm::runtime::buffermanager::ipcnvls (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7ipcNvlsENSt3setIiEEN8nvinfer14DimsEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::itensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10ITensorPtrE", false]], "tensorrt_llm::runtime::buffermanager::kbyte_type (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10kBYTE_TYPEE", false]], "tensorrt_llm::runtime::buffermanager::managed (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7managedEN8nvinfer14DimsEN8nvinfer18DataTypeE", false], [1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7managedENSt6size_tEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::memorypoolfree (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager14memoryPoolFreeEv", false]], "tensorrt_llm::runtime::buffermanager::memorypoolreserved (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager18memoryPoolReservedEv", false]], "tensorrt_llm::runtime::buffermanager::memorypooltrimto (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager16memoryPoolTrimToENSt6size_tE", false]], "tensorrt_llm::runtime::buffermanager::memorypoolused (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager14memoryPoolUsedEv", false]], "tensorrt_llm::runtime::buffermanager::mpool (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager5mPoolE", false]], 
"tensorrt_llm::runtime::buffermanager::mstream (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7mStreamE", false]], "tensorrt_llm::runtime::buffermanager::mtrimpool (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager9mTrimPoolE", false]], "tensorrt_llm::runtime::buffermanager::pinned (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager6pinnedEN8nvinfer14DimsEN8nvinfer18DataTypeE", false], [1, "_CPPv4N12tensorrt_llm7runtime13BufferManager6pinnedENSt6size_tEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::pinnedpool (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10pinnedPoolEN8nvinfer14DimsEN8nvinfer18DataTypeE", false], [1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10pinnedPoolENSt6size_tEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::setmem (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager6setMemER7IBuffer7int32_t", false]], "tensorrt_llm::runtime::buffermanager::setzero (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager7setZeroER7IBuffer", false]], "tensorrt_llm::runtime::buffermanager::~buffermanager (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManagerD0Ev", false]], "tensorrt_llm::runtime::bufferrange (c++ class)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime11BufferRangeE", false]], "tensorrt_llm::runtime::bufferrange::base (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime11BufferRange4BaseE", false]], "tensorrt_llm::runtime::bufferrange::bufferrange (c++ function)": [[1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI1UEEbEEEN12tensorrt_llm7runtime11BufferRange11BufferRangeERK7IBuffer", false], [1, "_CPPv4I0_NSt11enable_if_tIXntNSt10is_const_vI1UEEEbEEEN12tensorrt_llm7runtime11BufferRange11BufferRangeER7IBuffer", false], [1, "_CPPv4N12tensorrt_llm7runtime11BufferRange11BufferRangeEP1T9size_type", false]], "tensorrt_llm::runtime::canaccesspeer (c++ function)": [[1, 
"_CPPv4N12tensorrt_llm7runtime13canAccessPeerERK11WorldConfig", false]], "tensorrt_llm::runtime::constpointercast (c++ function)": [[1, "_CPPv4I00EN12tensorrt_llm7runtime16constPointerCastENSt10shared_ptrINSt14remove_const_tI1TEEEERRNSt10unique_ptrI1T1DEE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime16constPointerCastENSt10shared_ptrINSt14remove_const_tI1TEEEERKNSt10shared_ptrI1TEE", false]], "tensorrt_llm::runtime::cudaevent (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime9CudaEventE", false]], "tensorrt_llm::runtime::cudaevent::cudaevent (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent9CudaEventE7pointerb", false], [1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent9CudaEventEj", false]], "tensorrt_llm::runtime::cudaevent::deleter (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7DeleterE", false]], "tensorrt_llm::runtime::cudaevent::deleter::deleter (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7Deleter7DeleterEb", false], [1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7Deleter7DeleterEv", false]], "tensorrt_llm::runtime::cudaevent::deleter::mownsevent (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7Deleter10mOwnsEventE", false]], "tensorrt_llm::runtime::cudaevent::deleter::operator() (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9CudaEvent7DeleterclE7pointer", false]], "tensorrt_llm::runtime::cudaevent::element_type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent12element_typeE", false]], "tensorrt_llm::runtime::cudaevent::eventptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent8EventPtrE", false]], "tensorrt_llm::runtime::cudaevent::get (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9CudaEvent3getEv", false]], "tensorrt_llm::runtime::cudaevent::mevent (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent6mEventE", false]], "tensorrt_llm::runtime::cudaevent::pointer (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7pointerE", false]], 
"tensorrt_llm::runtime::cudaevent::synchronize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9CudaEvent11synchronizeEv", false]], "tensorrt_llm::runtime::cudastream (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime10CudaStreamE", false]], "tensorrt_llm::runtime::cudastream::cudastream (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamE12cudaStream_t", false], [1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamE12cudaStream_tib", false], [1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamEji", false]], "tensorrt_llm::runtime::cudastream::deleter (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7DeleterE", false]], "tensorrt_llm::runtime::cudastream::deleter::deleter (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7Deleter7DeleterEb", false], [1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7Deleter7DeleterEv", false]], "tensorrt_llm::runtime::cudastream::deleter::mownsstream (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7Deleter11mOwnsStreamE", false]], "tensorrt_llm::runtime::cudastream::deleter::operator() (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream7DeleterclE12cudaStream_t", false]], "tensorrt_llm::runtime::cudastream::get (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream3getEv", false]], "tensorrt_llm::runtime::cudastream::getdevice (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream9getDeviceEv", false]], "tensorrt_llm::runtime::cudastream::mdevice (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7mDeviceE", false]], "tensorrt_llm::runtime::cudastream::mstream (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7mStreamE", false]], "tensorrt_llm::runtime::cudastream::record (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream6recordEN9CudaEvent7pointerE", false], [1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream6recordERK9CudaEvent", false]], "tensorrt_llm::runtime::cudastream::streamptr 
(c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime10CudaStream9StreamPtrE", false]], "tensorrt_llm::runtime::cudastream::synchronize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream11synchronizeEv", false]], "tensorrt_llm::runtime::cudastream::wait (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream4waitEN9CudaEvent7pointerE", false], [1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream4waitERK9CudaEvent", false]], "tensorrt_llm::runtime::datatypetraits (c++ struct)": [[1, "_CPPv4I_N8nvinfer18DataTypeE_b_bEN12tensorrt_llm7runtime14DataTypeTraitsE", false]], "tensorrt_llm::runtime::datatypetraits<kdatatype, kunsigned, true> (c++ struct)": [[1, "_CPPv4I_N8nvinfer18DataTypeE_bEN12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEEE", false]], "tensorrt_llm::runtime::datatypetraits<kdatatype, kunsigned, true>::name (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<kdatatype, kunsigned, true>::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEE4sizeE", false]], "tensorrt_llm::runtime::datatypetraits<kdatatype, kunsigned, true>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEE4typeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kbool, kunsigned> (c++ struct)": [[1, "_CPPv4I_bEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kBOOLE9kUnsignedEE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kbool, kunsigned>::name (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kBOOLE9kUnsignedE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kbool, kunsigned>::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kBOOLE9kUnsignedE4sizeE", false]], 
"tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kbool, kunsigned>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kBOOLE9kUnsignedE4typeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kfloat> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kFLOATEEE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kfloat>::name (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kFLOATEE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kfloat>::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kFLOATEE4sizeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kfloat>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kFLOATEE4typeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::khalf> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kHALFEEE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::khalf>::name (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kHALFEE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::khalf>::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kHALFEE4sizeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::khalf>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kHALFEE4typeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint32, true> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EXL1EEEE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint32, true>::name (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EXL1EEE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint32, true>::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EXL1EEE4sizeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint32, true>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EXL1EEE4typeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint32> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EEE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint32>::name (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint32>::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EE4sizeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint32>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EE4typeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint64, true> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEEE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint64, true>::name (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint64, true>::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEE4sizeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint64, true>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEE4typeE", false]], 
"tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint64> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EEE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint64>::name (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint64>::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EE4sizeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint64>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EE4typeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint8> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kINT8EEE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint8>::name (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kINT8EE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint8>::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kINT8EE4sizeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint8>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kINT8EE4typeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kuint8, kunsigned> (c++ struct)": [[1, "_CPPv4I_bEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kUINT8E9kUnsignedEE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kuint8, kunsigned>::name (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kUINT8E9kUnsignedE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kuint8, kunsigned>::size (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kUINT8E9kUnsignedE4sizeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kuint8, kunsigned>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kUINT8E9kUnsignedE4typeE", false]], "tensorrt_llm::runtime::decoder (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoderE", false]], "tensorrt_llm::runtime::decoder::beamsearchbuffers (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffersE", false]], "tensorrt_llm::runtime::decoder::beamsearchbuffers::beamsearchbuffers (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers17BeamSearchBuffersERK13BufferManager", false]], "tensorrt_llm::runtime::decoder::beamsearchbuffers::mcumlogprobstmp (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers15mCumLogProbsTmpE", false]], "tensorrt_llm::runtime::decoder::beamsearchbuffers::mnumsms (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers7mNumSMsE", false]], "tensorrt_llm::runtime::decoder::beamsearchbuffers::moutputbeamhypotheses (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers21mOutputBeamHypothesesE", false]], "tensorrt_llm::runtime::decoder::beamsearchbuffers::reshape (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers7reshapeE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::decoder::decoderstate (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderStateE", false]], "tensorrt_llm::runtime::decoder::decoderstate::allocatespeculativedecodingbuffers (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState34allocateSpeculativeDecodingBuffersE23SpeculativeDecodingModeN8nvinfer18DataTypeERK13BufferManager", false]], "tensorrt_llm::runtime::decoder::decoderstate::decoderstate (c++ function)": [[1, 
"_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState12DecoderStateEN8nvinfer18DataTypeERK13BufferManager", false]], "tensorrt_llm::runtime::decoder::decoderstate::decodinginputptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState16DecodingInputPtrE", false]], "tensorrt_llm::runtime::decoder::decoderstate::decodingoutputptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState17DecodingOutputPtrE", false]], "tensorrt_llm::runtime::decoder::decoderstate::disablelookahead (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState16disableLookaheadERK13RequestVector", false]], "tensorrt_llm::runtime::decoder::decoderstate::getacceptedlengthscumsum (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState24getAcceptedLengthsCumSumEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getacceptedpackedpaths (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState22getAcceptedPackedPathsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getallnewtokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getAllNewTokensEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getbeamsearchbuffers (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState20getBeamSearchBuffersEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getcumlogprobs (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getCumLogProbsE10SizeType32", false], [1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getCumLogProbsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getfinishedsteps (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState16getFinishedStepsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getfinishedsum (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getFinishedSumEv", false]], 
"tensorrt_llm::runtime::decoder::decoderstate::getfinishreasons (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState16getFinishReasonsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getgatheredids (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getGatheredIdsE10SizeType32", false], [1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getGatheredIdsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getids (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState6getIdsE10SizeType32", false], [1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState6getIdsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getjointdecodinginput (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState21getJointDecodingInputEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getjointdecodingoutput (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState22getJointDecodingOutputEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getlogprobs (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState11getLogProbsE10SizeType32", false], [1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState11getLogProbsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getmaxbeamwidth (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBeamWidthEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getmaxdecodingdecodertokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState27getMaxDecodingDecoderTokensEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getmaxdecodingenginetokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getMaxDecodingEngineTokensEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getmaxsequencelength (c++ function)": [[1, 
"_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState20getMaxSequenceLengthEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getnextdrafttokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getNextDraftTokensEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getnextdrafttokenslengths (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState25getNextDraftTokensLengthsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getnumdecodingenginetokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getNumDecodingEngineTokensE10SizeType32", false], [1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getNumDecodingEngineTokensEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getparentids (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState12getParentIdsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getprevdrafttokenslengths (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState25getPrevDraftTokensLengthsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getsequencelengths (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getspeculativedecodingmode (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getSpeculativeDecodingModeEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::llmrequestptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState13LlmRequestPtrE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mbeamsearchbuffers (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState18mBeamSearchBuffersE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mfinishedsteps (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState14mFinishedStepsE", false]], 
"tensorrt_llm::runtime::decoder::decoderstate::mjointdecodinginput (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState19mJointDecodingInputE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mjointdecodingoutput (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState20mJointDecodingOutputE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mmaxbatchsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState13mMaxBatchSizeE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mmaxbeamwidth (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState13mMaxBeamWidthE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mmaxdecodingdecodertokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState25mMaxDecodingDecoderTokensE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mmaxdecodingenginetokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24mMaxDecodingEngineTokensE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mmaxsequencelength (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState18mMaxSequenceLengthE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mnumdecodingenginetokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24mNumDecodingEngineTokensE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mspeculativedecodingmode (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24mSpeculativeDecodingModeE", false]], "tensorrt_llm::runtime::decoder::decoderstate::requestvector (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState13RequestVectorE", false]], "tensorrt_llm::runtime::decoder::decoderstate::setnumdecodingenginetokens (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState26setNumDecodingEngineTokensE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::decoder::decoderstate::setup 
(c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", false]], "tensorrt_llm::runtime::decoder::decoderstate::setupeagle (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState10setupEagleEN12EagleBuffers6InputsE", false]], "tensorrt_llm::runtime::decoder::decoderstate::setupexplicitdrafttokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState24setupExplicitDraftTokensEN26ExplicitDraftTokensBuffers6InputsE", false]], "tensorrt_llm::runtime::decoder::decoderstate::setuplookahead (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14setupLookaheadE24LookaheadDecodingBuffers", false]], "tensorrt_llm::runtime::decoder::decoderstate::setupspeculativedecoding (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24setupSpeculativeDecodingERK23SpeculativeDecodingMode10SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", false]], "tensorrt_llm::runtime::decoder::decoderstate::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState9TensorPtrE", false]], "tensorrt_llm::runtime::decoder_batch (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batchE", false]], "tensorrt_llm::runtime::decoder_batch::input (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5InputE", false]], "tensorrt_llm::runtime::decoder_batch::input::batchslots (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input10batchSlotsE", false]], "tensorrt_llm::runtime::decoder_batch::input::batchslotsrequestorder (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input22batchSlotsRequestOrderE", false]], "tensorrt_llm::runtime::decoder_batch::input::cacheindirection (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input16cacheIndirectionE", false]], 
"tensorrt_llm::runtime::decoder_batch::input::eagleinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input11eagleInputsE", false]], "tensorrt_llm::runtime::decoder_batch::input::eaglelastinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input15eagleLastInputsE", false]], "tensorrt_llm::runtime::decoder_batch::input::explicitdrafttokensinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input25explicitDraftTokensInputsE", false]], "tensorrt_llm::runtime::decoder_batch::input::explicitdrafttokenslastinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input29explicitDraftTokensLastInputsE", false]], "tensorrt_llm::runtime::decoder_batch::input::generationsteps (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input15generationStepsE", false]], "tensorrt_llm::runtime::decoder_batch::input::input (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input5InputERKNSt6vectorI14TensorConstPtrEE", false], [1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input5InputERKNSt6vectorINSt6vectorI14TensorConstPtrEEEE10SizeType32", false]], "tensorrt_llm::runtime::decoder_batch::input::logits (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input6logitsE", false]], "tensorrt_llm::runtime::decoder_batch::input::maxdecodersteps (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input15maxDecoderStepsE", false]], "tensorrt_llm::runtime::decoder_batch::input::predicteddraftlogits (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input20predictedDraftLogitsE", false]], "tensorrt_llm::runtime::decoder_batch::input::tensorconstptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input14TensorConstPtrE", false]], "tensorrt_llm::runtime::decoder_batch::input::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input9TensorPtrE", false]], "tensorrt_llm::runtime::decoder_batch::output (c++ 
class)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch6OutputE", false]], "tensorrt_llm::runtime::decoder_batch::output::cacheindirection (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch6Output16cacheIndirectionE", false]], "tensorrt_llm::runtime::decoder_batch::output::output (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch6Output6OutputEv", false]], "tensorrt_llm::runtime::decoder_batch::output::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch6Output9TensorPtrE", false]], "tensorrt_llm::runtime::decoder_batch::request (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7RequestE", false]], "tensorrt_llm::runtime::decoder_batch::request::badwordslist (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request12badWordsListE", false]], "tensorrt_llm::runtime::decoder_batch::request::bufferptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request9BufferPtrE", false]], "tensorrt_llm::runtime::decoder_batch::request::draftlogits (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request11draftLogitsE", false]], "tensorrt_llm::runtime::decoder_batch::request::drafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request11draftTokensE", false]], "tensorrt_llm::runtime::decoder_batch::request::dtype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request5dtypeE", false]], "tensorrt_llm::runtime::decoder_batch::request::eagleconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request11eagleConfigE", false]], "tensorrt_llm::runtime::decoder_batch::request::embeddingbias (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request13embeddingBiasE", false]], "tensorrt_llm::runtime::decoder_batch::request::endid (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request5endIdE", false]], "tensorrt_llm::runtime::decoder_batch::request::generatedtokensperenginestep 
(c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request28generatedTokensPerEngineStepE", false]], "tensorrt_llm::runtime::decoder_batch::request::ids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request3idsE", false]], "tensorrt_llm::runtime::decoder_batch::request::inputlen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request8inputLenE", false]], "tensorrt_llm::runtime::decoder_batch::request::lookaheadruntimeconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request22lookaheadRuntimeConfigE", false]], "tensorrt_llm::runtime::decoder_batch::request::maxnewtokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request12maxNewTokensE", false]], "tensorrt_llm::runtime::decoder_batch::request::medusapaths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request11medusaPathsE", false]], "tensorrt_llm::runtime::decoder_batch::request::medusatreeids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request13medusaTreeIdsE", false]], "tensorrt_llm::runtime::decoder_batch::request::request (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request7RequestE14TensorConstPtr10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EE", false]], "tensorrt_llm::runtime::decoder_batch::request::stopwordslist (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request13stopWordsListE", false]], "tensorrt_llm::runtime::decoder_batch::request::tensorconstptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request14TensorConstPtrE", false]], "tensorrt_llm::runtime::decoder_batch::request::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request9TensorPtrE", false]], "tensorrt_llm::runtime::decodinginput (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInputE", false]], "tensorrt_llm::runtime::decodinginput::badwordslens (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime13DecodingInput12badWordsLensE", false]], "tensorrt_llm::runtime::decodinginput::badwordslists (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13badWordsListsE", false]], "tensorrt_llm::runtime::decodinginput::badwordsptrs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12badWordsPtrsE", false]], "tensorrt_llm::runtime::decodinginput::batchsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput9batchSizeE", false]], "tensorrt_llm::runtime::decodinginput::batchslots (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput10batchSlotsE", false]], "tensorrt_llm::runtime::decodinginput::beamwidths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput10beamWidthsE", false]], "tensorrt_llm::runtime::decodinginput::cacheindirection (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput16cacheIndirectionE", false]], "tensorrt_llm::runtime::decodinginput::decodinginput (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13DecodingInputE10SizeType3210SizeType3210SizeType3210SizeType3214TensorConstPtr9TensorPtr14TensorConstPtr", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11eagleInputsE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs (c++ struct)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputsE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::acceptedlens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs12acceptedLensE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::acceptedpathids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs15acceptedPathIdsE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::acceptedtokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs14acceptedTokensE", false]], 
"tensorrt_llm::runtime::decodinginput::eagleinputs::chunkedcontextnexttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs24chunkedContextNextTokensE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::eagleinputs (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::lastdraftlens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs13lastDraftLensE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::lastdraftpaths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs14lastDraftPathsE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::lastdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs15lastDraftTokensE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::nextdraftlens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs13nextDraftLensE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::nextdraftpaths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs14nextDraftPathsE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::nextdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs15nextDraftTokensE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::seqslots (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs8seqSlotsE", false]], "tensorrt_llm::runtime::decodinginput::embeddingbias (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13embeddingBiasE", false]], "tensorrt_llm::runtime::decodinginput::endids (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime13DecodingInput6endIdsE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputsE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25explicitDraftTokensInputsE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::bestpathindices (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs15bestPathIndicesE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::bestpathlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs15bestPathLengthsE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::lastdraftindices (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs16lastDraftIndicesE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::lastdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs15lastDraftTokensE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::lastgenerationlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs21lastGenerationLengthsE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::lastpositionidsbase (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs19lastPositionIdsBaseE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::masks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs5masksE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::maxgenlengthdevice (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs18maxGenLengthDeviceE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::nextdraftindices (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs16nextDraftIndicesE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::nextdraftprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs14nextDraftProbsE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::nextdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs15nextDraftTokensE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::nextflattokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs14nextFlatTokensE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::nextgenerationlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs21nextGenerationLengthsE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::packedpositionids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs17packedPositionIdsE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::seqslots (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs8seqSlotsE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputsE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25externalDraftTokensInputsE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::constantthreshold (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs17constantThresholdE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::draftlogits (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs11draftLogitsE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::draftprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs10draftProbsE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::drafttokenids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs13draftTokenIdsE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::numdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs14numDraftTokensE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::numdrafttokenshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs18numDraftTokensHostE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::step (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs4stepE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::targetprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs11targetProbsE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::usedraftlogits (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs14useDraftLogitsE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::usedraftlogitshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs18useDraftLogitsHostE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::userandomacceptancethreshold (c++ 
member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs28useRandomAcceptanceThresholdE", false]], "tensorrt_llm::runtime::decodinginput::finishreasons (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13finishReasonsE", false]], "tensorrt_llm::runtime::decodinginput::generationsteps (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15generationStepsE", false]], "tensorrt_llm::runtime::decodinginput::lengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput7lengthsE", false]], "tensorrt_llm::runtime::decodinginput::logits (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput6logitsE", false]], "tensorrt_llm::runtime::decodinginput::logitsvec (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput9logitsVecE", false]], "tensorrt_llm::runtime::decodinginput::lookaheadinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15lookaheadInputsE", false]], "tensorrt_llm::runtime::decodinginput::lookaheadinputs (c++ struct)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15LookaheadInputsE", false]], "tensorrt_llm::runtime::decodinginput::lookaheadinputs::tokensperstep (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15LookaheadInputs13tokensPerStepE", false]], "tensorrt_llm::runtime::decodinginput::maxattentionwindow (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput18maxAttentionWindowE", false]], "tensorrt_llm::runtime::decodinginput::maxbadwordslen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput14maxBadWordsLenE", false]], "tensorrt_llm::runtime::decodinginput::maxlength (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput9maxLengthE", false]], "tensorrt_llm::runtime::decodinginput::maxstopwordslen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15maxStopWordsLenE", false]], "tensorrt_llm::runtime::decodinginput::medusainputs (c++ class)": [[1, 
"_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputsE", false]], "tensorrt_llm::runtime::decodinginput::medusainputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12medusaInputsE", false]], "tensorrt_llm::runtime::decodinginput::medusainputs::medusacurtokensperstep (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs22medusaCurTokensPerStepE", false]], "tensorrt_llm::runtime::decodinginput::medusainputs::medusalogits (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs12medusaLogitsE", false]], "tensorrt_llm::runtime::decodinginput::medusainputs::medusapaths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs11medusaPathsE", false]], "tensorrt_llm::runtime::decodinginput::medusainputs::medusatargettokensperstep (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs25medusaTargetTokensPerStepE", false]], "tensorrt_llm::runtime::decodinginput::medusainputs::medusatreeids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs13medusaTreeIdsE", false]], "tensorrt_llm::runtime::decodinginput::norepeatngramsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput17noRepeatNgramSizeE", false]], "tensorrt_llm::runtime::decodinginput::sequencelimitlength (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput19sequenceLimitLengthE", false]], "tensorrt_llm::runtime::decodinginput::sinktokenlength (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15sinkTokenLengthE", false]], "tensorrt_llm::runtime::decodinginput::step (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput4stepE", false]], "tensorrt_llm::runtime::decodinginput::stopwordslens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13stopWordsLensE", false]], "tensorrt_llm::runtime::decodinginput::stopwordslists (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput14stopWordsListsE", 
false]], "tensorrt_llm::runtime::decodinginput::stopwordsptrs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13stopWordsPtrsE", false]], "tensorrt_llm::runtime::decodinginput::tensorconstptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput14TensorConstPtrE", false]], "tensorrt_llm::runtime::decodinginput::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput9TensorPtrE", false]], "tensorrt_llm::runtime::decodingoutput (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutputE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypothesesE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14beamHypothesesE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::batchdones (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses10batchDonesE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::cumlogprobscba (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses14cumLogProbsCBAE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::empty (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses5emptyERK13BufferManager", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::init (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses4initERK13BufferManager11TokenIdType", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::logprobscba (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses11logProbsCBAE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::minnormedscorescba (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses18minNormedScoresCBAE", false]], 
"tensorrt_llm::runtime::decodingoutput::beamhypotheses::normedscorescba (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses15normedScoresCBAE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::numbeamscba (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses11numBeamsCBAE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::outputidscba (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses12outputIdsCBAE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::release (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses7releaseEv", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::reshape (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses7reshapeE10SizeType3210SizeType3210SizeType32", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::sequencelengthscba (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses18sequenceLengthsCBAE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::slice (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses5sliceE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::decodingoutput::cacheindirection (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput16cacheIndirectionE", false]], "tensorrt_llm::runtime::decodingoutput::cumlogprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput11cumLogProbsE", false]], "tensorrt_llm::runtime::decodingoutput::decodingoutput (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14DecodingOutputE9TensorPtr9TensorPtr", false]], "tensorrt_llm::runtime::decodingoutput::eaglebuffers (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput12eagleBuffersE", false]], "tensorrt_llm::runtime::decodingoutput::explicitdrafttokensbuffers (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime14DecodingOutput26explicitDraftTokensBuffersE", false]], "tensorrt_llm::runtime::decodingoutput::finishedsum (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput11finishedSumE", false]], "tensorrt_llm::runtime::decodingoutput::finishreasons (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput13finishReasonsE", false]], "tensorrt_llm::runtime::decodingoutput::gatheredids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput11gatheredIdsE", false]], "tensorrt_llm::runtime::decodingoutput::ids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput3idsE", false]], "tensorrt_llm::runtime::decodingoutput::knegativeinfinity (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput17kNegativeInfinityE", false]], "tensorrt_llm::runtime::decodingoutput::lengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput7lengthsE", false]], "tensorrt_llm::runtime::decodingoutput::logprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput8logProbsE", false]], "tensorrt_llm::runtime::decodingoutput::logprobstiled (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput13logProbsTiledE", false]], "tensorrt_llm::runtime::decodingoutput::lookaheadoutputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput16lookaheadOutputsE", false]], "tensorrt_llm::runtime::decodingoutput::newtokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput9newTokensE", false]], "tensorrt_llm::runtime::decodingoutput::newtokenssteps (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14newTokensStepsE", false]], "tensorrt_llm::runtime::decodingoutput::newtokensvec (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput12newTokensVecE", false]], "tensorrt_llm::runtime::decodingoutput::parentids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput9parentIdsE", false]], 
"tensorrt_llm::runtime::decodingoutput::speculativedecodingoutputs (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputsE", false]], "tensorrt_llm::runtime::decodingoutput::speculativedecodingoutputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26speculativeDecodingOutputsE", false]], "tensorrt_llm::runtime::decodingoutput::speculativedecodingoutputs::acceptedlengthscumsum (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs21acceptedLengthsCumSumE", false]], "tensorrt_llm::runtime::decodingoutput::speculativedecodingoutputs::acceptedtokenslen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs17acceptedTokensLenE", false]], "tensorrt_llm::runtime::decodingoutput::speculativedecodingoutputs::nextdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs15nextDraftTokensE", false]], "tensorrt_llm::runtime::decodingoutput::speculativedecodingoutputs::nextdrafttokenslen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs18nextDraftTokensLenE", false]], "tensorrt_llm::runtime::decodingoutput::speculativedecodingoutputs::pathsoffsets (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs12pathsOffsetsE", false]], "tensorrt_llm::runtime::decodingoutput::speculativedecodingoutputs::prevdrafttokenslen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs18prevDraftTokensLenE", false]], "tensorrt_llm::runtime::decodingoutput::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput9TensorPtrE", false]], "tensorrt_llm::runtime::deviceallocationnvls (c++ class)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime20DeviceAllocationNvlsE", false]], "tensorrt_llm::runtime::deviceallocationnvls::_capacity (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls9_capacityE", false]], "tensorrt_llm::runtime::deviceallocationnvls::_handle (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls7_handleE", false]], "tensorrt_llm::runtime::deviceallocationnvls::deviceallocationnvls (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls20DeviceAllocationNvlsEv", false]], "tensorrt_llm::runtime::deviceallocationnvls::free (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls4freeEv", false]], "tensorrt_llm::runtime::deviceallocationnvls::getcapacity (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime20DeviceAllocationNvls11getCapacityEv", false]], "tensorrt_llm::runtime::deviceallocationnvls::getipcunicastpointers (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls21getIpcUnicastPointersEv", false]], "tensorrt_llm::runtime::deviceallocationnvls::getmulticastpointer (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime20DeviceAllocationNvls19getMulticastPointerEv", false]], "tensorrt_llm::runtime::deviceallocationnvls::getunicastpointer (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime20DeviceAllocationNvls17getUnicastPointerEv", false]], "tensorrt_llm::runtime::deviceallocationnvls::reset (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls5resetE6size_tNSt3setIiEE", false]], "tensorrt_llm::runtime::deviceallocationnvls::~deviceallocationnvls (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvlsD0Ev", false]], "tensorrt_llm::runtime::eaglebuffers (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffersE", false]], "tensorrt_llm::runtime::eaglebuffers::bufferptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers9BufferPtrE", false]], "tensorrt_llm::runtime::eaglebuffers::chunkedcontextnexttokenshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers28chunkedContextNextTokensHostE", false]], 
"tensorrt_llm::runtime::eaglebuffers::cumsumgenerationlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers23cumSumGenerationLengthsE", false]], "tensorrt_llm::runtime::eaglebuffers::eaglebuffers (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12EagleBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN8executor14DecodingConfigE", false]], "tensorrt_llm::runtime::eaglebuffers::engineinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12engineInputsE", false]], "tensorrt_llm::runtime::eaglebuffers::engineoutputs (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputsE", false]], "tensorrt_llm::runtime::eaglebuffers::engineoutputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13engineOutputsE", false]], "tensorrt_llm::runtime::eaglebuffers::engineoutputs::acceptedlens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs12acceptedLensE", false]], "tensorrt_llm::runtime::eaglebuffers::engineoutputs::acceptedpaths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs13acceptedPathsE", false]], "tensorrt_llm::runtime::eaglebuffers::engineoutputs::acceptedtokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs14acceptedTokensE", false]], "tensorrt_llm::runtime::eaglebuffers::engineoutputs::chunkedcontextnexttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs24chunkedContextNextTokensE", false]], "tensorrt_llm::runtime::eaglebuffers::engineoutputs::nextdraftlens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs13nextDraftLensE", false]], "tensorrt_llm::runtime::eaglebuffers::engineoutputs::nextdraftpaths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs14nextDraftPathsE", false]], 
"tensorrt_llm::runtime::eaglebuffers::engineoutputs::nextdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs15nextDraftTokensE", false]], "tensorrt_llm::runtime::eaglebuffers::greedysamplinghost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers18greedySamplingHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6InputsE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::alllayersdrafttokenids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs22allLayersDraftTokenIdsE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::alllayersdrafttokenidspredecessor (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs33allLayersDraftTokenIdsPredecessorE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::alllayersscores (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs15allLayersScoresE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::chunkedcontextnexttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs24chunkedContextNextTokensE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::create (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs6createE10SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfig", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::currentexpandindices (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs20currentExpandIndicesE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::draftlens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs9draftLensE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::draftpaths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs10draftPathsE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::draftpathshost (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs14draftPathsHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::drafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs11draftTokensE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::dynamictreemaxtopkhost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs22dynamicTreeMaxTopKHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::eaglenetctxcontextlengthshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs29eagleNetCtxContextLengthsHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::eaglenetctxpastkeyvaluelengthshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs34eagleNetCtxPastKeyValueLengthsHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::eaglenetctxrequesttypeshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs27eagleNetCtxRequestTypesHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::eaglenetgencontextlengthshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs29eagleNetGenContextLengthsHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::eaglenetgenpastkeyvaluelengthshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs34eagleNetGenPastKeyValueLengthsHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::eaglenetgenrequesttypeshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs27eagleNetGenRequestTypesHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::inputgentokenshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs18inputGenTokensHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::posterioralpha (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs14posteriorAlphaE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::posteriorthreshold (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs18posteriorThresholdE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::prevscores (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs10prevScoresE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::randomdatasample (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs16randomDataSampleE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::randomdatavalidation (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs20randomDataValidationE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::specdecodinggenerationlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs29specDecodingGenerationLengthsE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::specdecodinggenerationlengthshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs33specDecodingGenerationLengthsHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::specdecodingpackedmasks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs23specDecodingPackedMasksE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::specdecodingpositionoffsets (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs27specDecodingPositionOffsetsE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::temperatures (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs12temperaturesE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::usedynamictreehost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs18useDynamicTreeHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::usespecdecoding (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs15useSpecDecodingE", false]], "tensorrt_llm::runtime::eaglebuffers::insertinputtensors (c++ function)": [[1, 
"_CPPv4NK12tensorrt_llm7runtime12EagleBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", false]], "tensorrt_llm::runtime::eaglebuffers::itensor (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers7ITensorE", false]], "tensorrt_llm::runtime::eaglebuffers::llmrequestptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13LlmRequestPtrE", false]], "tensorrt_llm::runtime::eaglebuffers::maxgenerationlength (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers19maxGenerationLengthE", false]], "tensorrt_llm::runtime::eaglebuffers::mdefaultposteriorthreshold (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers26mDefaultPosteriorThresholdE", false]], "tensorrt_llm::runtime::eaglebuffers::mdogreedysampling (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers17mDoGreedySamplingE", false]], "tensorrt_llm::runtime::eaglebuffers::posterioralphahost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers18posteriorAlphaHostE", false]], "tensorrt_llm::runtime::eaglebuffers::posteriorthresholdhost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers22posteriorThresholdHostE", false]], "tensorrt_llm::runtime::eaglebuffers::requestvector (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13RequestVectorE", false]], "tensorrt_llm::runtime::eaglebuffers::reshape (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", false]], "tensorrt_llm::runtime::eaglebuffers::scanreducetempstorage (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers21scanReduceTempStorageE", false]], "tensorrt_llm::runtime::eaglebuffers::scanreducetempstoragebytes (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers26scanReduceTempStorageBytesE", false]], "tensorrt_llm::runtime::eaglebuffers::setfrominputs (c++ function)": [[1, 
"_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", false], [1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", false]], "tensorrt_llm::runtime::eaglebuffers::sizetype32 (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers10SizeType32E", false]], "tensorrt_llm::runtime::eaglebuffers::tensormap (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers9TensorMapE", false]], "tensorrt_llm::runtime::eaglebuffers::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers9TensorPtrE", false]], "tensorrt_llm::runtime::eaglemodule (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime11EagleModuleE", false]], "tensorrt_llm::runtime::eaglemodule::eaglemodule (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11EagleModule11EagleModuleE10SizeType3210SizeType3210SizeType3210SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime11EagleModule11EagleModuleEv", false]], "tensorrt_llm::runtime::eaglemodule::getdefaulteaglechoices (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11EagleModule22getDefaultEagleChoicesEv", false]], "tensorrt_llm::runtime::eaglemodule::getmaxnonleafnodesperlayer (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11EagleModule26getMaxNonLeafNodesPerLayerEv", false]], "tensorrt_llm::runtime::eaglemodule::getnumtransformerlayers (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11EagleModule23getNumTransformerLayersEv", false]], "tensorrt_llm::runtime::eaglemodule::mdefaulteaglechoices (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11EagleModule20mDefaultEagleChoicesE", false]], "tensorrt_llm::runtime::eaglemodule::mmaxnonleafnodesperlayer (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime11EagleModule24mMaxNonLeafNodesPerLayerE", false]], "tensorrt_llm::runtime::eaglemodule::mnumtransformerslayer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11EagleModule21mNumTransformersLayerE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffersE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::bufferptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers9BufferPtrE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::cumsumgenerationlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers23cumSumGenerationLengthsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineinputs (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers12EngineInputsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers12engineInputsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineinputs::positionoffsets (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers12EngineInputs15positionOffsetsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineinputs::requesttypesdevice (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers12EngineInputs18requestTypesDeviceE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13engineOutputsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::bestpathindices (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs15bestPathIndicesE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::bestpathlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs15bestPathLengthsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::masks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs5masksE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::maxgentoken (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs11maxGenTokenE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::nextdraftindices (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs16nextDraftIndicesE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::nextdraftprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs14nextDraftProbsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::nextdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs15nextDraftTokensE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::nextflattokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs14nextFlatTokensE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::nextgenerationlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs21nextGenerationLengthsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::nextpositionoffsets (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs19nextPositionOffsetsE", false]], 
"tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::packedpositionids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs17packedPositionIdsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::totalgentoken (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs13totalGenTokenE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::explicitdrafttokensbuffers (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers26ExplicitDraftTokensBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6InputsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::create (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs6createE10SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::draftindices (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs12draftIndicesE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::draftprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs10draftProbsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::drafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs11draftTokensE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::generationlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs17generationLengthsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::generationlengthshost (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs21generationLengthsHostE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::maxgenlengthhost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs16maxGenLengthHostE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::packedmasks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs11packedMasksE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::positionids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs11positionIdsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::positionidsbase (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs15positionIdsBaseE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::randomdatasample (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs16randomDataSampleE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::randomdatavalidation (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs20randomDataValidationE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::temperatures (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs12temperaturesE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::usespecdecoding (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs15useSpecDecodingE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::insertinputtensors (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::itensor (c++ type)": [[1, 
"_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers7ITensorE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::reshape (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::scantempstorage (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers15scanTempStorageE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::scantempstoragebytes (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers20scanTempStorageBytesE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::setfrominputs (c++ function)": [[1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", false], [1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::sizetype32 (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers10SizeType32E", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::tensormap (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers9TensorMapE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers9TensorPtrE", false]], "tensorrt_llm::runtime::genericprompttuningparams (c++ class)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime25GenericPromptTuningParamsE", false]], "tensorrt_llm::runtime::genericprompttuningparams::embeddingtable (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams14embeddingTableE", false]], "tensorrt_llm::runtime::genericprompttuningparams::genericprompttuningparams (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams25GenericPromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", false]], "tensorrt_llm::runtime::genericprompttuningparams::prompttuningenabled (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams19promptTuningEnabledE", false]], "tensorrt_llm::runtime::genericprompttuningparams::sizetype32 (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams10SizeType32E", false]], "tensorrt_llm::runtime::genericprompttuningparams::tasks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams5tasksE", false]], "tensorrt_llm::runtime::genericprompttuningparams::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams9TensorPtrE", false]], "tensorrt_llm::runtime::genericprompttuningparams::vocabsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams9vocabSizeE", false]], "tensorrt_llm::runtime::getdefaultbatchslots (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20getDefaultBatchSlotsEN7runtime10SizeType32E", false]], "tensorrt_llm::runtime::gptdecoder (c++ class)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime10GptDecoderE", false]], "tensorrt_llm::runtime::gptdecoder::cudastreamptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder13CudaStreamPtrE", false]], "tensorrt_llm::runtime::gptdecoder::disablelookahead (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", false]], "tensorrt_llm::runtime::gptdecoder::forwardasync (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder12forwardAsyncER14DecodingOutputRK13DecodingInput", false]], "tensorrt_llm::runtime::gptdecoder::forwardsync (c++ function)": [[1, 
"_CPPv4N12tensorrt_llm7runtime10GptDecoder11forwardSyncER14DecodingOutputRK13DecodingInput", false]], "tensorrt_llm::runtime::gptdecoder::getsamplingconfig (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder17getSamplingConfigEv", false]], "tensorrt_llm::runtime::gptdecoder::gptdecoder (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", false]], "tensorrt_llm::runtime::gptdecoder::mdecodinglayerworkspace (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder23mDecodingLayerWorkspaceE", false]], "tensorrt_llm::runtime::gptdecoder::mdecodingmode (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder13mDecodingModeE", false]], "tensorrt_llm::runtime::gptdecoder::mdynamicdecodelayer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder19mDynamicDecodeLayerE", false]], "tensorrt_llm::runtime::gptdecoder::mmanager (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder8mManagerE", false]], "tensorrt_llm::runtime::gptdecoder::mmaxbatchsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder13mMaxBatchSizeE", false]], "tensorrt_llm::runtime::gptdecoder::msamplingconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder15mSamplingConfigE", false]], "tensorrt_llm::runtime::gptdecoder::mvocabsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10mVocabSizeE", false]], "tensorrt_llm::runtime::gptdecoder::mvocabsizepadded (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder16mVocabSizePaddedE", false]], "tensorrt_llm::runtime::gptdecoder::setup (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", false]], "tensorrt_llm::runtime::gptdecoder::tensorptr (c++ type)": [[1, 
"_CPPv4N12tensorrt_llm7runtime10GptDecoder9TensorPtrE", false]], "tensorrt_llm::runtime::gptdecoderbatched (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatchedE", false]], "tensorrt_llm::runtime::gptdecoderbatched::cudastreamptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13CudaStreamPtrE", false]], "tensorrt_llm::runtime::gptdecoderbatched::disablelookahead (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched16disableLookaheadERK13RequestVectorRK9TensorPtr", false]], "tensorrt_llm::runtime::gptdecoderbatched::finalize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", false]], "tensorrt_llm::runtime::gptdecoderbatched::forward (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE", false]], "tensorrt_llm::runtime::gptdecoderbatched::forwardasync (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE", false]], "tensorrt_llm::runtime::gptdecoderbatched::forwarddispatch (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15forwardDispatchERN13decoder_batch6OutputERKN13decoder_batch5InputE", false]], "tensorrt_llm::runtime::gptdecoderbatched::getbuffermanager (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched16getBufferManagerEv", false]], "tensorrt_llm::runtime::gptdecoderbatched::getdecoderstate (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15getDecoderStateEv", false], [1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched15getDecoderStateEv", false]], "tensorrt_llm::runtime::gptdecoderbatched::getdecoderstream (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched16getDecoderStreamEv", false]], "tensorrt_llm::runtime::gptdecoderbatched::getunderlyingdecoder (c++ function)": [[1, 
"_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched20getUnderlyingDecoderEv", false]], "tensorrt_llm::runtime::gptdecoderbatched::gptdecoderbatched (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtrRK23SpeculativeDecodingModeN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::gptdecoderbatched::gptdecoderptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13GptDecoderPtrE", false]], "tensorrt_llm::runtime::gptdecoderbatched::llmrequestptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13LlmRequestPtrE", false]], "tensorrt_llm::runtime::gptdecoderbatched::mbuffermanager (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14mBufferManagerE", false]], "tensorrt_llm::runtime::gptdecoderbatched::mdecoder (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched8mDecoderE", false]], "tensorrt_llm::runtime::gptdecoderbatched::mdecoderstate (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13mDecoderStateE", false]], "tensorrt_llm::runtime::gptdecoderbatched::mdecoderstream (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14mDecoderStreamE", false]], "tensorrt_llm::runtime::gptdecoderbatched::mruntimestream (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14mRuntimeStreamE", false]], "tensorrt_llm::runtime::gptdecoderbatched::prepareforward (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14prepareForwardE10SizeType32RN13decoder_batch6OutputERKN13decoder_batch5InputE", false]], "tensorrt_llm::runtime::gptdecoderbatched::requestvector (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13RequestVectorE", false]], "tensorrt_llm::runtime::gptdecoderbatched::seteagleinputs (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14setEagleInputsERKN13decoder_batch5InputE", false]], 
"tensorrt_llm::runtime::gptdecoderbatched::setexplicitdrafttokensinputs (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched28setExplicitDraftTokensInputsERKN13decoder_batch5InputE", false]], "tensorrt_llm::runtime::gptdecoderbatched::setup (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", false]], "tensorrt_llm::runtime::gptdecoderbatched::sharedconstptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14SharedConstPtrE", false]], "tensorrt_llm::runtime::gptdecoderbatched::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched9TensorPtrE", false]], "tensorrt_llm::runtime::gptjsonconfig (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfigE", false]], "tensorrt_llm::runtime::gptjsonconfig::enginefilename (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14engineFilenameERK11WorldConfig", false], [1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14engineFilenameERK11WorldConfigRKNSt6stringE", false]], "tensorrt_llm::runtime::gptjsonconfig::getcontextparallelism (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig21getContextParallelismEv", false]], "tensorrt_llm::runtime::gptjsonconfig::getgpuspernode (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14getGpusPerNodeEv", false]], "tensorrt_llm::runtime::gptjsonconfig::getmodelconfig (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14getModelConfigEv", false]], "tensorrt_llm::runtime::gptjsonconfig::getmodelconfigmutable (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig21getModelConfigMutableEv", false]], "tensorrt_llm::runtime::gptjsonconfig::getname (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig7getNameEv", false]], 
"tensorrt_llm::runtime::gptjsonconfig::getpipelineparallelism (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig22getPipelineParallelismEv", false]], "tensorrt_llm::runtime::gptjsonconfig::getprecision (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig12getPrecisionEv", false]], "tensorrt_llm::runtime::gptjsonconfig::getruntimedefaults (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig18getRuntimeDefaultsEv", false]], "tensorrt_llm::runtime::gptjsonconfig::gettensorparallelism (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig20getTensorParallelismEv", false]], "tensorrt_llm::runtime::gptjsonconfig::getversion (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig10getVersionEv", false]], "tensorrt_llm::runtime::gptjsonconfig::getworldsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig12getWorldSizeEv", false]], "tensorrt_llm::runtime::gptjsonconfig::gptjsonconfig (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", false]], "tensorrt_llm::runtime::gptjsonconfig::mcontextparallelism (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig19mContextParallelismE", false]], "tensorrt_llm::runtime::gptjsonconfig::mgpuspernode (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig12mGpusPerNodeE", false]], "tensorrt_llm::runtime::gptjsonconfig::mmodelconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig12mModelConfigE", false]], "tensorrt_llm::runtime::gptjsonconfig::mname (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5mNameE", false]], "tensorrt_llm::runtime::gptjsonconfig::mpipelineparallelism (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig20mPipelineParallelismE", false]], "tensorrt_llm::runtime::gptjsonconfig::mprecision (c++ 
member)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig10mPrecisionE", false]], "tensorrt_llm::runtime::gptjsonconfig::mruntimedefaults (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig16mRuntimeDefaultsE", false]], "tensorrt_llm::runtime::gptjsonconfig::mtensorparallelism (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig18mTensorParallelismE", false]], "tensorrt_llm::runtime::gptjsonconfig::mversion (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig8mVersionE", false]], "tensorrt_llm::runtime::gptjsonconfig::parse (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5parseERKNSt10filesystem4pathE", false], [1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5parseERKNSt6stringE", false], [1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5parseERNSt7istreamE", false]], "tensorrt_llm::runtime::ibuffer (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBufferE", false]], "tensorrt_llm::runtime::ibuffer::data (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4dataENSt6size_tE", false], [1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4dataEv", false], [1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer4dataENSt6size_tE", false], [1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer4dataEv", false]], "tensorrt_llm::runtime::ibuffer::datatype (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer8DataTypeE", false]], "tensorrt_llm::runtime::ibuffer::getcapacity (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer11getCapacityEv", false]], "tensorrt_llm::runtime::ibuffer::getdatatype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer11getDataTypeEv", false]], "tensorrt_llm::runtime::ibuffer::getdatatypename (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer15getDataTypeNameE8DataType", false], [1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer15getDataTypeNameEv", false]], "tensorrt_llm::runtime::ibuffer::getmemorytype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer13getMemoryTypeEv", 
false]], "tensorrt_llm::runtime::ibuffer::getmemorytypename (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer17getMemoryTypeNameEv", false]], "tensorrt_llm::runtime::ibuffer::getsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer7getSizeEv", false]], "tensorrt_llm::runtime::ibuffer::getsizeinbytes (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer14getSizeInBytesEv", false]], "tensorrt_llm::runtime::ibuffer::ibuffer (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer7IBufferERK7IBuffer", false], [1, "_CPPv4N12tensorrt_llm7runtime7IBuffer7IBufferEv", false]], "tensorrt_llm::runtime::ibuffer::memorytype (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer10memoryTypeEPKv", false]], "tensorrt_llm::runtime::ibuffer::operator= (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBufferaSERK7IBuffer", false]], "tensorrt_llm::runtime::ibuffer::release (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer7releaseEv", false]], "tensorrt_llm::runtime::ibuffer::resize (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer6resizeENSt6size_tE", false]], "tensorrt_llm::runtime::ibuffer::sharedconstptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer14SharedConstPtrE", false]], "tensorrt_llm::runtime::ibuffer::sharedptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer9SharedPtrE", false]], "tensorrt_llm::runtime::ibuffer::slice (c++ function)": [[1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", false], [1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", false], [1, "_CPPv4N12tensorrt_llm7runtime7IBuffer5sliceE9SharedPtrNSt6size_tE", false], [1, "_CPPv4N12tensorrt_llm7runtime7IBuffer5sliceE9SharedPtrNSt6size_tENSt6size_tE", false]], 
"tensorrt_llm::runtime::ibuffer::tobytes (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer7toBytesENSt6size_tE", false]], "tensorrt_llm::runtime::ibuffer::uniqueconstptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer14UniqueConstPtrE", false]], "tensorrt_llm::runtime::ibuffer::uniqueptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer9UniquePtrE", false]], "tensorrt_llm::runtime::ibuffer::view (c++ function)": [[1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer4viewE14UniqueConstPtrRR9TConstPtrNSt6size_tE", false], [1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4viewE9SharedPtr", false], [1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4viewE9SharedPtrNSt6size_tE", false]], "tensorrt_llm::runtime::ibuffer::wrap (c++ function)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tENSt6size_tE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrRNSt6vectorI1TEE", false], [1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tE", false], [1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tENSt6size_tE", false]], "tensorrt_llm::runtime::ibuffer::~ibuffer (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBufferD0Ev", false]], "tensorrt_llm::runtime::igptdecoder (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoderE", false]], "tensorrt_llm::runtime::igptdecoder::create (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", false]], "tensorrt_llm::runtime::igptdecoder::disablelookahead (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", 
false]], "tensorrt_llm::runtime::igptdecoder::forwardasync (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder12forwardAsyncER14DecodingOutputRK13DecodingInput", false]], "tensorrt_llm::runtime::igptdecoder::forwardsync (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder11forwardSyncER14DecodingOutputRK13DecodingInput", false]], "tensorrt_llm::runtime::igptdecoder::getsamplingconfig (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder17getSamplingConfigEv", false]], "tensorrt_llm::runtime::igptdecoder::setup (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", false]], "tensorrt_llm::runtime::igptdecoder::tensorconstptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder14TensorConstPtrE", false]], "tensorrt_llm::runtime::igptdecoder::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder9TensorPtrE", false]], "tensorrt_llm::runtime::igptdecoder::~igptdecoder (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoderD0Ev", false]], "tensorrt_llm::runtime::igptdecoderbatched (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatchedE", false]], "tensorrt_llm::runtime::igptdecoderbatched::cudastreamptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched13CudaStreamPtrE", false]], "tensorrt_llm::runtime::igptdecoderbatched::disablelookahead (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched16disableLookaheadERK13RequestVectorRK9TensorPtr", false]], "tensorrt_llm::runtime::igptdecoderbatched::finalize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime18IGptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", false]], "tensorrt_llm::runtime::igptdecoderbatched::forward (c++ function)": [[1, 
"_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE", false]], "tensorrt_llm::runtime::igptdecoderbatched::forwardasync (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE", false]], "tensorrt_llm::runtime::igptdecoderbatched::igptdecoderbatched (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched18IGptDecoderBatchedEv", false]], "tensorrt_llm::runtime::igptdecoderbatched::llmrequestptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched13LlmRequestPtrE", false]], "tensorrt_llm::runtime::igptdecoderbatched::requestvector (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched13RequestVectorE", false]], "tensorrt_llm::runtime::igptdecoderbatched::setup (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", false]], "tensorrt_llm::runtime::igptdecoderbatched::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched9TensorPtrE", false]], "tensorrt_llm::runtime::igptdecoderbatched::~igptdecoderbatched (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatchedD0Ev", false]], "tensorrt_llm::runtime::ipcmemory (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemoryE", false]], "tensorrt_llm::runtime::ipcmemory::allocateipcmemory (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory17allocateIpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfig", false]], "tensorrt_llm::runtime::ipcmemory::bufferptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9BufferPtrE", false]], "tensorrt_llm::runtime::ipcmemory::destroyipcmemory (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory16destroyIpcMemoryEv", false]], 
"tensorrt_llm::runtime::ipcmemory::flags_size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory10FLAGS_SIZEE", false]], "tensorrt_llm::runtime::ipcmemory::getcommptrs (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9IpcMemory11getCommPtrsEv", false]], "tensorrt_llm::runtime::ipcmemory::ipcmemory (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfigb", false], [1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryERK9IpcMemory", false], [1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryERR9IpcMemory", false]], "tensorrt_llm::runtime::ipcmemory::mbuffer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory7mBufferE", false]], "tensorrt_llm::runtime::ipcmemory::mcommptrs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9mCommPtrsE", false]], "tensorrt_llm::runtime::ipcmemory::mopenipc (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory8mOpenIpcE", false]], "tensorrt_llm::runtime::ipcmemory::mtprank (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory7mTpRankE", false]], "tensorrt_llm::runtime::ipcmemory::operator= (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemoryaSERK9IpcMemory", false], [1, "_CPPv4N12tensorrt_llm7runtime9IpcMemoryaSERR9IpcMemory", false]], "tensorrt_llm::runtime::ipcmemory::~ipcmemory (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemoryD0Ev", false]], "tensorrt_llm::runtime::ipcnvlsallocate (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime15ipcNvlsAllocateE6size_tNSt3setIiEE", false]], "tensorrt_llm::runtime::ipcnvlsfree (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ipcNvlsFreeEP13IpcNvlsHandle", false]], "tensorrt_llm::runtime::ipcnvlshandle (c++ struct)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandleE", false]], "tensorrt_llm::runtime::ipcnvlshandle::ipc_uc_handles (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle14ipc_uc_handlesE", false]], 
"tensorrt_llm::runtime::ipcnvlshandle::ipc_uc_ptrs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle11ipc_uc_ptrsE", false]], "tensorrt_llm::runtime::ipcnvlshandle::ipc_uc_vas (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle10ipc_uc_vasE", false]], "tensorrt_llm::runtime::ipcnvlshandle::mc_handle (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle9mc_handleE", false]], "tensorrt_llm::runtime::ipcnvlshandle::mc_ptr (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle6mc_ptrE", false]], "tensorrt_llm::runtime::ipcnvlshandle::mc_va (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle5mc_vaE", false]], "tensorrt_llm::runtime::ipcnvlshandle::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle4sizeE", false]], "tensorrt_llm::runtime::ipcnvlshandle::uc_handle (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle9uc_handleE", false]], "tensorrt_llm::runtime::ipcnvlshandle::uc_ptr (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle6uc_ptrE", false]], "tensorrt_llm::runtime::ipcnvlshandle::uc_va (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle5uc_vaE", false]], "tensorrt_llm::runtime::ipcnvlssupported (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime16ipcNvlsSupportedEv", false]], "tensorrt_llm::runtime::itensor (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensorE", false]], "tensorrt_llm::runtime::itensor::at (c++ function)": [[1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atE14UniqueConstPtrRR9TConstPtrRK5Shape", false], [1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atEN7ITensor14UniqueConstPtrERR9TConstPtrRKNSt16initializer_listI9DimType64EE", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor2atE9SharedPtrRK5Shape", false], [1, 
"_CPPv4N12tensorrt_llm7runtime7ITensor2atE9SharedPtrRKNSt16initializer_listI9DimType64EE", false]], "tensorrt_llm::runtime::itensor::castsize (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor8castSizeE6size_t", false]], "tensorrt_llm::runtime::itensor::dimtype64 (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor9DimType64E", false]], "tensorrt_llm::runtime::itensor::flattenn (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor8flattenNE9SharedPtrNSt7int64_tE", false]], "tensorrt_llm::runtime::itensor::getdimension (c++ function)": [[1, "_CPPv4I_10SizeType32ENK12tensorrt_llm7runtime7ITensor12getDimensionE9DimType64v", false]], "tensorrt_llm::runtime::itensor::getshape (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7ITensor8getShapeEv", false]], "tensorrt_llm::runtime::itensor::itensor (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor7ITensorERK7ITensor", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor7ITensorEv", false]], "tensorrt_llm::runtime::itensor::makeshape (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor9makeShapeERKNSt16initializer_listI9DimType64EE", false]], "tensorrt_llm::runtime::itensor::operator= (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensoraSERK7ITensor", false]], "tensorrt_llm::runtime::itensor::reshape (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor7reshapeERK5Shape", false]], "tensorrt_llm::runtime::itensor::resize (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor6resizeENSt6size_tE", false]], "tensorrt_llm::runtime::itensor::shape (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor5ShapeE", false]], "tensorrt_llm::runtime::itensor::shapeequals (c++ function)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor11shapeEqualsEbRK5ShapePK1T10SizeType32", false], [1, "_CPPv4I0ENK12tensorrt_llm7runtime7ITensor11shapeEqualsEbPK1T10SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor11shapeEqualsERK5ShapeRK5Shape", false], [1, 
"_CPPv4NK12tensorrt_llm7runtime7ITensor11shapeEqualsERK5Shape", false], [1, "_CPPv4NK12tensorrt_llm7runtime7ITensor11shapeEqualsERKNSt16initializer_listI10SizeType32EE", false]], "tensorrt_llm::runtime::itensor::sharedconstptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor14SharedConstPtrE", false]], "tensorrt_llm::runtime::itensor::sharedptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor9SharedPtrE", false]], "tensorrt_llm::runtime::itensor::slice (c++ function)": [[1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", false], [1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", false], [1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5Shape", false], [1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5ShapeNSt6size_tE", false], [1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EE", false], [1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EENSt6size_tE", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrNSt6size_tE", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrNSt6size_tENSt6size_tE", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRK5Shape", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRK5Shape9DimType64", false], [1, 
"_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRKNSt16initializer_listI9DimType64EE", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRKNSt16initializer_listI9DimType64EE9DimType64", false]], "tensorrt_llm::runtime::itensor::squeeze (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor7squeezeE10SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor7squeezeERK5Shape10SizeType32", false]], "tensorrt_llm::runtime::itensor::strides (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor7stridesERK5Shape", false]], "tensorrt_llm::runtime::itensor::tensormap (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor9TensorMapE", false]], "tensorrt_llm::runtime::itensor::tostring (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor8toStringERK5Shape", false]], "tensorrt_llm::runtime::itensor::uniqueconstptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor14UniqueConstPtrE", false]], "tensorrt_llm::runtime::itensor::uniqueptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor9UniquePtrE", false]], "tensorrt_llm::runtime::itensor::unsqueeze (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor9unsqueezeE10SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor9unsqueezeERK5Shape10SizeType32", false]], "tensorrt_llm::runtime::itensor::view (c++ function)": [[1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor4viewE14UniqueConstPtrRR9TConstPtrRK5Shape", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor4viewE9SharedPtr", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor4viewEN7IBuffer9SharedPtrERK5Shape", false]], "tensorrt_llm::runtime::itensor::volume (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor6volumeERK5Shape", false]], "tensorrt_llm::runtime::itensor::volumenonnegative (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor17volumeNonNegativeERK5Shape", false]], "tensorrt_llm::runtime::itensor::wrap (c++ 
function)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5Shape", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5ShapeNSt6size_tE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrRNSt6vectorI1TEERK5Shape", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5Shape", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5ShapeNSt6size_tE", false]], "tensorrt_llm::runtime::itensor::~itensor (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensorD0Ev", false]], "tensorrt_llm::runtime::lamportinitializeall (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20lamportInitializeAllEPvPvPv6size_t", false]], "tensorrt_llm::runtime::lookaheaddecodingbuffers (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffersE", false]], "tensorrt_llm::runtime::lookaheaddecodingbuffers::generationlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers17generationLengthsE", false]], "tensorrt_llm::runtime::lookaheaddecodingbuffers::lookaheaddecodingbuffers (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers24LookaheadDecodingBuffersE10SizeType3210SizeType32RK13BufferManager", false]], "tensorrt_llm::runtime::lookaheaddecodingbuffers::packedmasks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers11packedMasksE", false]], "tensorrt_llm::runtime::lookaheaddecodingbuffers::positionids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers11positionIdsE", false]], "tensorrt_llm::runtime::lookaheaddecodingbuffers::positionoffsets (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers15positionOffsetsE", false]], "tensorrt_llm::runtime::lookaheaddecodingbuffers::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers9TensorPtrE", false]], "tensorrt_llm::runtime::lookaheadmodule (c++ class)": 
[[1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModuleE", false]], "tensorrt_llm::runtime::lookaheadmodule::getexecutionconfig (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime15LookaheadModule18getExecutionConfigEv", false]], "tensorrt_llm::runtime::lookaheadmodule::lookaheadmodule (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule15LookaheadModuleE10SizeType3210SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule15LookaheadModuleEv", false]], "tensorrt_llm::runtime::lookaheadmodule::mexecutionconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule16mExecutionConfigE", false]], "tensorrt_llm::runtime::lookaheadmodule::setexecutionconfig (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule18setExecutionConfigERKN8executor23LookaheadDecodingConfigE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffersE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::batchslotshostcopy (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers18batchSlotsHostCopyE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::cumsumlength (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers12cumSumLengthE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::disablelookaheaddecoding (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers24disableLookaheadDecodingEv", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::enablelookaheaddecoding (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23enableLookaheadDecodingE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::generationlengthsdevice (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23generationLengthsDeviceE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::generationlengthshost (c++ member)": 
[[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers21generationLengthsHostE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::generationlengthshostcopy (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers25generationLengthsHostCopyE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::insertinputtensors (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers18insertInputTensorsER9TensorMapR9TensorMapRK11WorldConfig", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::lookaheadruntimebuffers (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23LookaheadRuntimeBuffersE10SizeType3210SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfigRKN8executor14DecodingConfigERK11TllmRuntime", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::packedmaskhost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers14packedMaskHostE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::packedmaskhostcopy (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers18packedMaskHostCopyE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::packedmasksdevice (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers17packedMasksDeviceE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::positionidsdevice (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers17positionIdsDeviceE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::positionidshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers15positionIdsHostE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::positionidshostcopy (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers19positionIdsHostCopyE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::positionoffsetsdevice (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers21positionOffsetsDeviceE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::positionoffsetshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers19positionOffsetsHostE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::positionoffsetshostcopy (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23positionOffsetsHostCopyE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::reshape (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers7reshapeE10SizeType3210SizeType3210SizeType32", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::setfrominputs (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::tensormap (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers9TensorMapE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers9TensorPtrE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::usespecdecoding (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers15useSpecDecodingE", false]], "tensorrt_llm::runtime::loracache (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCacheE", false]], "tensorrt_llm::runtime::loracache::bump (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache4bumpE10TaskIdType", false]], "tensorrt_llm::runtime::loracache::bumptaskinprogress (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache18bumpTaskInProgressE10TaskIdType", false]], "tensorrt_llm::runtime::loracache::claimpageswithevict (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache19claimPagesWithEvictE10SizeType32", false]], 
"tensorrt_llm::runtime::loracache::copytask (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache8copyTaskE10TaskIdTypeR9LoraCacheb", false]], "tensorrt_llm::runtime::loracache::copytaskmappages (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache16copyTaskMapPagesER9TaskValueRK9TaskValueRKNSt6vectorI6size_tEERK9LoraCache", false]], "tensorrt_llm::runtime::loracache::copytopages (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", false]], "tensorrt_llm::runtime::loracache::determinenumpages (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache17determineNumPagesE10TaskIdType", false], [1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache17determineNumPagesE9TensorPtr", false]], "tensorrt_llm::runtime::loracache::fits (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache4fitsE9TensorPtr", false]], "tensorrt_llm::runtime::loracache::get (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache3getE10TaskIdType", false]], "tensorrt_llm::runtime::loracache::getnumpages (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache11getNumPagesEv", false]], "tensorrt_llm::runtime::loracache::getpageptr (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache10getPagePtrE6size_t", false]], "tensorrt_llm::runtime::loracache::getstatus (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache9getStatusE10TaskIdType", false]], "tensorrt_llm::runtime::loracache::has (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache3hasE10TaskIdType", false]], "tensorrt_llm::runtime::loracache::isdone (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache6isDoneE10TaskIdType", false]], "tensorrt_llm::runtime::loracache::isloaded (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache8isLoadedE10TaskIdType", false]], 
"tensorrt_llm::runtime::loracache::loadweights (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsE10TaskIdType9TensorPtr9TensorPtr", false], [1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsER9TaskValue9TensorPtr9TensorPtr", false]], "tensorrt_llm::runtime::loracache::loracache (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9LoraCacheERK26LoraCachePageManagerConfigRK11ModelConfigRK11WorldConfigRK13BufferManager", false]], "tensorrt_llm::runtime::loracache::markalldone (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11markAllDoneEv", false]], "tensorrt_llm::runtime::loracache::marktaskdone (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache12markTaskDoneE10TaskIdType", false]], "tensorrt_llm::runtime::loracache::mbuffermanager (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache14mBufferManagerE", false]], "tensorrt_llm::runtime::loracache::mcachemap (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9mCacheMapE", false]], "tensorrt_llm::runtime::loracache::mcachemutex (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11mCacheMutexE", false]], "tensorrt_llm::runtime::loracache::mcachepagemanager (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache17mCachePageManagerE", false]], "tensorrt_llm::runtime::loracache::mdevicebuffermanagers (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21mDeviceBufferManagersE", false]], "tensorrt_llm::runtime::loracache::mdonetasks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache10mDoneTasksE", false]], "tensorrt_llm::runtime::loracache::minprogresstasks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache16mInProgressTasksE", false]], "tensorrt_llm::runtime::loracache::mmodelconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache12mModelConfigE", false]], "tensorrt_llm::runtime::loracache::mmoduleidtomodule (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime9LoraCache17mModuleIdToModuleE", false]], "tensorrt_llm::runtime::loracache::mpagemanagerconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache18mPageManagerConfigE", false]], "tensorrt_llm::runtime::loracache::mpagesmutex (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11mPagesMutexE", false]], "tensorrt_llm::runtime::loracache::mworldconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache12mWorldConfigE", false]], "tensorrt_llm::runtime::loracache::put (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache3putE10TaskIdType9TensorPtr9TensorPtrb", false]], "tensorrt_llm::runtime::loracache::splittransposecpu (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache17splitTransposeCpuER7ITensorRK7ITensor10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::loracache::splittransposecpuinner (c++ function)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime9LoraCache22splitTransposeCpuInnerEvR7ITensorRK7ITensor10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::loracache::taskidtype (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache10TaskIdTypeE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig (c++ struct)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfigE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::adaptersize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig11adapterSizeE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::insize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig6inSizeE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::layerid (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig7layerIdE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::moduleid (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig8moduleIdE", 
false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::numslots (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig8numSlotsE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::operator== (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfigeqERKN9LoraCache21TaskLayerModuleConfigE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::outsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig7outSizeE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::pageid (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig6pageIdE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::scalingvecpointer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig17scalingVecPointerE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::slotidx (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig7slotIdxE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::tostring (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig8toStringEv", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::weightsinpointer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig16weightsInPointerE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::weightsoutpointer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig17weightsOutPointerE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfiglistptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache28TaskLayerModuleConfigListPtrE", false]], "tensorrt_llm::runtime::loracache::taskvalue (c++ struct)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValueE", false]], "tensorrt_llm::runtime::loracache::taskvalue::configs (c++ 
member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue7configsE", false]], "tensorrt_llm::runtime::loracache::taskvalue::done (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue4doneE", false]], "tensorrt_llm::runtime::loracache::taskvalue::inprogress (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue10inProgressE", false]], "tensorrt_llm::runtime::loracache::taskvalue::it (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue2itE", false]], "tensorrt_llm::runtime::loracache::taskvalue::loaded (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue6loadedE", false]], "tensorrt_llm::runtime::loracache::taskvalue::loadinprogress (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue14loadInProgressE", false]], "tensorrt_llm::runtime::loracache::taskvalue::operator= (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValueaSERR9TaskValue", false]], "tensorrt_llm::runtime::loracache::taskvalue::pageids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue7pageIdsE", false]], "tensorrt_llm::runtime::loracache::taskvalue::taskvalue (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERKNSt6vectorINSt6size_tEEERK28TaskLayerModuleConfigListPtrNSt4listI10TaskIdTypeE8iteratorEbbbb", false], [1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERR9TaskValue", false], [1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueEv", false]], "tensorrt_llm::runtime::loracache::taskvalue::~taskvalue (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValueD0Ev", false]], "tensorrt_llm::runtime::loracache::taskvalueptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache12TaskValuePtrE", false]], "tensorrt_llm::runtime::loracache::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TensorPtrE", false]], "tensorrt_llm::runtime::loracache::valuestatus (c++ enum)": [[1, 
"_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatusE", false]], "tensorrt_llm::runtime::loracache::valuestatus::kvalue_status_loaded (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatus20kVALUE_STATUS_LOADEDE", false]], "tensorrt_llm::runtime::loracache::valuestatus::kvalue_status_missing (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatus21kVALUE_STATUS_MISSINGE", false]], "tensorrt_llm::runtime::loracache::valuestatus::kvalue_status_processing (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatus24kVALUE_STATUS_PROCESSINGE", false]], "tensorrt_llm::runtime::loracachefullexception (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime22LoraCacheFullExceptionE", false]], "tensorrt_llm::runtime::loracachefullexception::loracachefullexception (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime22LoraCacheFullException22LoraCacheFullExceptionERKNSt6stringE", false]], "tensorrt_llm::runtime::loracachefullexception::~loracachefullexception (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime22LoraCacheFullExceptionD0Ev", false]], "tensorrt_llm::runtime::loracachepagemanager (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManagerE", false]], "tensorrt_llm::runtime::loracachepagemanager::blockptr (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime20LoraCachePageManager8blockPtrE10SizeType32", false]], "tensorrt_llm::runtime::loracachepagemanager::claimpages (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager10claimPagesE10SizeType32", false]], "tensorrt_llm::runtime::loracachepagemanager::initialize (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager10initializeERK13BufferManager", false]], "tensorrt_llm::runtime::loracachepagemanager::loracachepagemanager (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager20LoraCachePageManagerERK26LoraCachePageManagerConfigRK13BufferManager", false]], 
"tensorrt_llm::runtime::loracachepagemanager::mconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager7mConfigE", false]], "tensorrt_llm::runtime::loracachepagemanager::mfreepageids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager12mFreePageIdsE", false]], "tensorrt_llm::runtime::loracachepagemanager::mispagefree (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager11mIsPageFreeE", false]], "tensorrt_llm::runtime::loracachepagemanager::mpageblocks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager11mPageBlocksE", false]], "tensorrt_llm::runtime::loracachepagemanager::mutablepageptr (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager14mutablePagePtrENSt6size_tE", false]], "tensorrt_llm::runtime::loracachepagemanager::numavailablepages (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime20LoraCachePageManager17numAvailablePagesEv", false]], "tensorrt_llm::runtime::loracachepagemanager::pageptr (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime20LoraCachePageManager7pagePtrENSt6size_tE", false]], "tensorrt_llm::runtime::loracachepagemanager::releasepages (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager12releasePagesERKNSt6vectorINSt6size_tEEE", false]], "tensorrt_llm::runtime::loracachepagemanager::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager9TensorPtrE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfigE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::getdatatype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig11getDataTypeEv", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::getinittozero (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig13getInitToZeroEv", false]], 
"tensorrt_llm::runtime::loracachepagemanagerconfig::getmaxpagesperblock (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig19getMaxPagesPerBlockEv", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::getmemorytype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig13getMemoryTypeEv", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::getnumcopystreams (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig17getNumCopyStreamsEv", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::getpagewidth (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig12getPageWidthEv", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::getslotsperpage (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig15getSlotsPerPageEv", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::gettotalnumpages (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig16getTotalNumPagesEv", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::loracachepagemanagerconfig (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig26LoraCachePageManagerConfigEN7runtime10MemoryTypeEN8nvinfer18DataTypeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::mdatatype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig9mDataTypeE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::minittozero (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig11mInitToZeroE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::mmaxpagesperblock (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig17mMaxPagesPerBlockE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::mmemorytype (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig11mMemoryTypeE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::mnumcopystreams (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig15mNumCopyStreamsE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::mpagewidth (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig10mPageWidthE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::mslotsperpage (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig13mSlotsPerPageE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::mtotalnumpages (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig14mTotalNumPagesE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::setdatatype (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig11setDataTypeERKN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::setinittozero (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig13setInitToZeroEb", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::setmaxpagesperblock (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig19setMaxPagesPerBlockERK10SizeType32", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::setmemorytype (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig13setMemoryTypeERKN7runtime10MemoryTypeE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::setnumcopystreams (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig17setNumCopyStreamsE10SizeType32", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::setpagewidth (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig12setPageWidthERK10SizeType32", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::setslotsperpage (c++ 
function)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig15setSlotsPerPageERK10SizeType32", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::settotalnumpage (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig15setTotalNumPageERK10SizeType32", false]], "tensorrt_llm::runtime::loraexpectedexception (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime21LoraExpectedExceptionE", false]], "tensorrt_llm::runtime::loraexpectedexception::loraexpectedexception (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime21LoraExpectedException21LoraExpectedExceptionERKNSt6stringE", false]], "tensorrt_llm::runtime::loraexpectedexception::~loraexpectedexception (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime21LoraExpectedExceptionD0Ev", false]], "tensorrt_llm::runtime::loramodule (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModuleE", false]], "tensorrt_llm::runtime::loramodule::createloramodules (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", false]], "tensorrt_llm::runtime::loramodule::flattenedinoutsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule18flattenedInOutSizeE10SizeType32b", false]], "tensorrt_llm::runtime::loramodule::indim (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule5inDimEv", false]], "tensorrt_llm::runtime::loramodule::indimfirst (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule10inDimFirstEv", false]], "tensorrt_llm::runtime::loramodule::insize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule6inSizeE10SizeType32", false]], "tensorrt_llm::runtime::loramodule::intpsplitdim (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule12inTpSplitDimEv", false]], "tensorrt_llm::runtime::loramodule::localinadaptersize (c++ function)": [[1, 
"_CPPv4NK12tensorrt_llm7runtime10LoraModule18localInAdapterSizeE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::loramodule::localindim (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule10localInDimE10SizeType32", false]], "tensorrt_llm::runtime::loramodule::localinoutsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule14localInOutSizeE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::loramodule::localinsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule11localInSizeE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::loramodule::localoutadaptersize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule19localOutAdapterSizeE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::loramodule::localoutdim (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule11localOutDimE10SizeType32", false]], "tensorrt_llm::runtime::loramodule::localoutsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule12localOutSizeE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::loramodule::localscalessize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule15localScalesSizeE10SizeType32b", false]], "tensorrt_llm::runtime::loramodule::localtotalsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule14localTotalSizeE10SizeType3210SizeType32b", false]], "tensorrt_llm::runtime::loramodule::loramodule (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10LoraModule", false], [1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10ModuleType10SizeType3210SizeType32bb10SizeType3210SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleEv", false]], "tensorrt_llm::runtime::loramodule::mindim (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule6mInDimE", false]], "tensorrt_llm::runtime::loramodule::mindimfirst (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime10LoraModule11mInDimFirstE", false]], "tensorrt_llm::runtime::loramodule::mintpsplitdim (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule13mInTpSplitDimE", false]], "tensorrt_llm::runtime::loramodule::moduletype (c++ enum)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleTypeE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kattn_dense (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType11kATTN_DENSEE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kattn_k (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType7kATTN_KE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kattn_q (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType7kATTN_QE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kattn_qkv (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType9kATTN_QKVE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kattn_v (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType7kATTN_VE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kcross_attn_dense (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType17kCROSS_ATTN_DENSEE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kcross_attn_k (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType13kCROSS_ATTN_KE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kcross_attn_q (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType13kCROSS_ATTN_QE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kcross_attn_qkv (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType15kCROSS_ATTN_QKVE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kcross_attn_v (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType13kCROSS_ATTN_VE", false]], 
"tensorrt_llm::runtime::loramodule::moduletype::kinvalid (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType8kINVALIDE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kmlp_4h_to_h (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMLP_4H_TO_HE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kmlp_gate (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType9kMLP_GATEE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kmlp_gate_up (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMLP_GATE_UPE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kmlp_h_to_4h (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMLP_H_TO_4HE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kmlp_router (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType11kMLP_ROUTERE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kmoe_4h_to_h (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMOE_4H_TO_HE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kmoe_gate (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType9kMOE_GATEE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kmoe_h_to_4h (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMOE_H_TO_4HE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kmoe_router (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType11kMOE_ROUTERE", false]], "tensorrt_llm::runtime::loramodule::moutdim (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule7mOutDimE", false]], "tensorrt_llm::runtime::loramodule::moutdimfirst (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12mOutDimFirstE", false]], "tensorrt_llm::runtime::loramodule::mouttpsplitdim (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime10LoraModule14mOutTpSplitDimE", false]], "tensorrt_llm::runtime::loramodule::mtype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule5mTypeE", false]], "tensorrt_llm::runtime::loramodule::name (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule4nameEv", false]], "tensorrt_llm::runtime::loramodule::operator= (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModuleaSERK10LoraModule", false]], "tensorrt_llm::runtime::loramodule::outdim (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule6outDimEv", false]], "tensorrt_llm::runtime::loramodule::outdimfirst (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule11outDimFirstEv", false]], "tensorrt_llm::runtime::loramodule::outsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule7outSizeE10SizeType32", false]], "tensorrt_llm::runtime::loramodule::outtpsplitdim (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule13outTpSplitDimEv", false]], "tensorrt_llm::runtime::loramodule::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule9TensorPtrE", false]], "tensorrt_llm::runtime::loramodule::tomodulename (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12toModuleNameE10ModuleType", false], [1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12toModuleNameE10SizeType32", false]], "tensorrt_llm::runtime::loramodule::tomoduletype (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12toModuleTypeERKNSt11string_viewE", false]], "tensorrt_llm::runtime::loramodule::value (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule5valueEv", false]], "tensorrt_llm::runtime::lorataskidtype (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14LoraTaskIdTypeE", false]], "tensorrt_llm::runtime::medusamodule (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime12MedusaModuleE", false]], "tensorrt_llm::runtime::medusamodule::getmedusachoices (c++ function)": [[1, 
"_CPPv4NK12tensorrt_llm7runtime12MedusaModule16getMedusaChoicesEv", false]], "tensorrt_llm::runtime::medusamodule::mdefaultmedusachoices (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule21mDefaultMedusaChoicesE", false]], "tensorrt_llm::runtime::medusamodule::medusachoices (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule13MedusaChoicesE", false]], "tensorrt_llm::runtime::medusamodule::medusamodule (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule12MedusaModuleE10SizeType3210SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule12MedusaModuleEv", false]], "tensorrt_llm::runtime::medusamodule::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule9TensorPtrE", false]], "tensorrt_llm::runtime::memorycounters (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCountersE", false]], "tensorrt_llm::runtime::memorycounters::allocate (c++ function)": [[1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime14MemoryCounters8allocateEv10SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8allocateE10MemoryType10SizeType32", false]], "tensorrt_llm::runtime::memorycounters::bytestostring (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters13bytesToStringE10SizeType32i", false], [1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters13bytesToStringE8DiffTypei", false]], "tensorrt_llm::runtime::memorycounters::deallocate (c++ function)": [[1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime14MemoryCounters10deallocateEv10SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters10deallocateE10MemoryType10SizeType32", false]], "tensorrt_llm::runtime::memorycounters::difftype (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8DiffTypeE", false]], "tensorrt_llm::runtime::memorycounters::getcpu (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters6getCpuEv", false]], "tensorrt_llm::runtime::memorycounters::getcpudiff (c++ function)": [[1, 
"_CPPv4NK12tensorrt_llm7runtime14MemoryCounters10getCpuDiffEv", false]], "tensorrt_llm::runtime::memorycounters::getgpu (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters6getGpuEv", false]], "tensorrt_llm::runtime::memorycounters::getgpudiff (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters10getGpuDiffEv", false]], "tensorrt_llm::runtime::memorycounters::getinstance (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters11getInstanceEv", false]], "tensorrt_llm::runtime::memorycounters::getpinned (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters9getPinnedEv", false]], "tensorrt_llm::runtime::memorycounters::getpinneddiff (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters13getPinnedDiffEv", false]], "tensorrt_llm::runtime::memorycounters::getpinnedpool (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters13getPinnedPoolEv", false]], "tensorrt_llm::runtime::memorycounters::getpinnedpooldiff (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters17getPinnedPoolDiffEv", false]], "tensorrt_llm::runtime::memorycounters::getuvm (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters6getUVMEv", false]], "tensorrt_llm::runtime::memorycounters::getuvmdiff (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters10getUVMDiffEv", false]], "tensorrt_llm::runtime::memorycounters::mcpu (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters4mCpuE", false]], "tensorrt_llm::runtime::memorycounters::mcpudiff (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8mCpuDiffE", false]], "tensorrt_llm::runtime::memorycounters::memorycounters (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters14MemoryCountersEv", false]], "tensorrt_llm::runtime::memorycounters::mgpu (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters4mGpuE", false]], "tensorrt_llm::runtime::memorycounters::mgpudiff (c++ 
member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8mGpuDiffE", false]], "tensorrt_llm::runtime::memorycounters::mpinned (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters7mPinnedE", false]], "tensorrt_llm::runtime::memorycounters::mpinneddiff (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters11mPinnedDiffE", false]], "tensorrt_llm::runtime::memorycounters::mpinnedpool (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters11mPinnedPoolE", false]], "tensorrt_llm::runtime::memorycounters::mpinnedpooldiff (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters15mPinnedPoolDiffE", false]], "tensorrt_llm::runtime::memorycounters::muvm (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters4mUVME", false]], "tensorrt_llm::runtime::memorycounters::muvmdiff (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8mUVMDiffE", false]], "tensorrt_llm::runtime::memorycounters::sizetype32 (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters10SizeType32E", false]], "tensorrt_llm::runtime::memorycounters::tostring (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters8toStringEv", false]], "tensorrt_llm::runtime::memorytype (c++ enum)": [[1, "_CPPv4N12tensorrt_llm7runtime10MemoryTypeE", false]], "tensorrt_llm::runtime::memorytype::kcpu (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10MemoryType4kCPUE", false]], "tensorrt_llm::runtime::memorytype::kgpu (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10MemoryType4kGPUE", false]], "tensorrt_llm::runtime::memorytype::kpinned (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10MemoryType7kPINNEDE", false]], "tensorrt_llm::runtime::memorytype::kpinnedpool (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10MemoryType11kPINNEDPOOLE", false]], "tensorrt_llm::runtime::memorytype::kuvm (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10MemoryType4kUVME", false]], "tensorrt_llm::runtime::memorytypestring 
(c++ struct)": [[1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime16MemoryTypeStringE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kcpu> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kCPUEEE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kcpu>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kCPUEE5valueE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kgpu> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kGPUEEE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kgpu>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kGPUEE5valueE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kpinned> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType7kPINNEDEEE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kpinned>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType7kPINNEDEE5valueE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kpinnedpool> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType11kPINNEDPOOLEEE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kpinnedpool>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType11kPINNEDPOOLEE5valueE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kuvm> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kUVMEEE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kuvm>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kUVMEE5valueE", false]], "tensorrt_llm::runtime::modelconfig (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfigE", false]], "tensorrt_llm::runtime::modelconfig::computecontextlogits (c++ 
function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20computeContextLogitsEb", false], [1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20computeContextLogitsEv", false]], "tensorrt_llm::runtime::modelconfig::computegenerationlogits (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig23computeGenerationLogitsEb", false], [1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig23computeGenerationLogitsEv", false]], "tensorrt_llm::runtime::modelconfig::countlocallayers (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16countLocalLayersE9LayerType10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::modelconfig::countlowerranklayers (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20countLowerRankLayersE9LayerType10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::modelconfig::disableseamlesslookaheaddecoding (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig32disableSeamlessLookaheadDecodingEv", false]], "tensorrt_llm::runtime::modelconfig::enableseamlesslookaheaddecoding (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig31enableSeamlessLookaheadDecodingE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::getcontextfmha (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getContextFMHAEv", false]], "tensorrt_llm::runtime::modelconfig::getdatatype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getDataTypeEv", false]], "tensorrt_llm::runtime::modelconfig::getencoderhiddensize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getEncoderHiddenSizeEv", false]], "tensorrt_llm::runtime::modelconfig::getgemmallreducedtype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21getGemmAllReduceDtypeEv", false]], "tensorrt_llm::runtime::modelconfig::gethiddensize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13getHiddenSizeEv", false]], "tensorrt_llm::runtime::modelconfig::getkvcachetype (c++ 
function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getKVCacheTypeEv", false]], "tensorrt_llm::runtime::modelconfig::getkvdatatype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13getKvDataTypeEv", false]], "tensorrt_llm::runtime::modelconfig::getlayertypes (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13getLayerTypesEv", false]], "tensorrt_llm::runtime::modelconfig::getlogitsdtype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getLogitsDtypeEv", false]], "tensorrt_llm::runtime::modelconfig::getloramodules (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getLoraModulesEv", false]], "tensorrt_llm::runtime::modelconfig::getmanageweightstype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getManageWeightsTypeEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxbatchsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getMaxBatchSizeEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxbeamwidth (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getMaxBeamWidthEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxdecodingdrafttokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig25getMaxDecodingDraftTokensEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxdecodingtokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getMaxDecodingTokensEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxencoderlen (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16getMaxEncoderLenEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxinputlen (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getMaxInputLenEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxlorarank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getMaxLoraRankEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxnumtokens (c++ function)": [[1, 
"_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getMaxNumTokensEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxpositionembeddings (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig24getMaxPositionEmbeddingsEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxpromptembeddingtablesize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig30getMaxPromptEmbeddingTableSizeEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxsequencelen (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig17getMaxSequenceLenEv", false]], "tensorrt_llm::runtime::modelconfig::getmlphiddensize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16getMlpHiddenSizeEv", false]], "tensorrt_llm::runtime::modelconfig::getmodelname (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getModelNameEv", false]], "tensorrt_llm::runtime::modelconfig::getmodelvariant (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getModelVariantEv", false]], "tensorrt_llm::runtime::modelconfig::getnbattentionlayers (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getNbAttentionLayersE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::modelconfig::getnbheads (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig10getNbHeadsEv", false]], "tensorrt_llm::runtime::modelconfig::getnbkvheads (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getNbKvHeadsE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::getnblayers (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::getnbrnnlayers (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getNbRnnLayersE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::modelconfig::getnumkvheadsperlayer (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21getNumKvHeadsPerLayerEv", false]], 
"tensorrt_llm::runtime::modelconfig::getnumkvheadsperlayerlocalrange (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig31getNumKvHeadsPerLayerLocalRangeE10SizeType3210SizeType32b", false]], "tensorrt_llm::runtime::modelconfig::getnumlanguages (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getNumLanguagesEv", false]], "tensorrt_llm::runtime::modelconfig::getoptprofilessplitpoints (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig25getOptProfilesSplitPointsEv", false]], "tensorrt_llm::runtime::modelconfig::getpagedcontextfmha (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig19getPagedContextFMHAEv", false]], "tensorrt_llm::runtime::modelconfig::getppreducescatter (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getPpReduceScatterEv", false]], "tensorrt_llm::runtime::modelconfig::getquantmode (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getQuantModeEv", false]], "tensorrt_llm::runtime::modelconfig::getrnnconfig (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getRnnConfigEv", false]], "tensorrt_llm::runtime::modelconfig::getrotaryembeddingdim (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21getRotaryEmbeddingDimEv", false]], "tensorrt_llm::runtime::modelconfig::getsizeperhead (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getSizePerHeadEv", false]], "tensorrt_llm::runtime::modelconfig::getspeculativedecodingmode (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig26getSpeculativeDecodingModeEv", false]], "tensorrt_llm::runtime::modelconfig::getspeculativedecodingmodule (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig28getSpeculativeDecodingModuleEv", false]], "tensorrt_llm::runtime::modelconfig::getspeculativedecodingmoduleptr (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig31getSpeculativeDecodingModulePtrEv", false], [1, 
"_CPPv4NK12tensorrt_llm7runtime11ModelConfig31getSpeculativeDecodingModulePtrEv", false]], "tensorrt_llm::runtime::modelconfig::getsumlocalkvheads (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getSumLocalKvHeadsE10SizeType3210SizeType32b", false]], "tensorrt_llm::runtime::modelconfig::gettokensperblock (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig17getTokensPerBlockEv", false]], "tensorrt_llm::runtime::modelconfig::getvocabsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getVocabSizeEv", false]], "tensorrt_llm::runtime::modelconfig::getvocabsizepadded (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getVocabSizePaddedE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::hasrnnconfig (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12hasRnnConfigEv", false]], "tensorrt_llm::runtime::modelconfig::hasspeculativedecodingmodule (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig28hasSpeculativeDecodingModuleEv", false]], "tensorrt_llm::runtime::modelconfig::iscontinuouskvcache (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig19isContinuousKVCacheEv", false]], "tensorrt_llm::runtime::modelconfig::iskvcacheenabled (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16isKVCacheEnabledEv", false]], "tensorrt_llm::runtime::modelconfig::ismultimodal (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12isMultiModalEv", false]], "tensorrt_llm::runtime::modelconfig::ispagedkvcache (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14isPagedKVCacheEv", false]], "tensorrt_llm::runtime::modelconfig::isrnnbased (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig10isRnnBasedEv", false]], "tensorrt_llm::runtime::modelconfig::istransformerbased (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18isTransformerBasedEv", false]], 
"tensorrt_llm::runtime::modelconfig::iswhisper (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig9isWhisperEv", false]], "tensorrt_llm::runtime::modelconfig::kdefault_num_tokens_per_block (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig29kDEFAULT_NUM_TOKENS_PER_BLOCKE", false]], "tensorrt_llm::runtime::modelconfig::kopt_profiles_split_points (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26kOPT_PROFILES_SPLIT_POINTSE", false]], "tensorrt_llm::runtime::modelconfig::kvcachetype (c++ enum)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11KVCacheTypeE", false]], "tensorrt_llm::runtime::modelconfig::kvcachetype::kcontinuous (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11KVCacheType11kCONTINUOUSE", false]], "tensorrt_llm::runtime::modelconfig::kvcachetype::kdisabled (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11KVCacheType9kDISABLEDE", false]], "tensorrt_llm::runtime::modelconfig::kvcachetype::kpaged (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11KVCacheType6kPAGEDE", false]], "tensorrt_llm::runtime::modelconfig::kvcachetypefromstring (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21KVCacheTypeFromStringENSt6stringE", false]], "tensorrt_llm::runtime::modelconfig::layertype (c++ enum)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerTypeE", false]], "tensorrt_llm::runtime::modelconfig::layertype::kattention (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerType10kATTENTIONE", false]], "tensorrt_llm::runtime::modelconfig::layertype::klinear (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerType7kLINEARE", false]], "tensorrt_llm::runtime::modelconfig::layertype::knoop (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerType5kNOOPE", false]], "tensorrt_llm::runtime::modelconfig::layertype::krecurrent (c++ enumerator)": [[1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerType10kRECURRENTE", false]], "tensorrt_llm::runtime::modelconfig::manageweightstype (c++ enum)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17ManageWeightsTypeE", false]], "tensorrt_llm::runtime::modelconfig::manageweightstype::kdisabled (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17ManageWeightsType9kDisabledE", false]], "tensorrt_llm::runtime::modelconfig::manageweightstype::kenabled (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17ManageWeightsType8kEnabledE", false]], "tensorrt_llm::runtime::modelconfig::mcomputecontextlogits (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21mComputeContextLogitsE", false]], "tensorrt_llm::runtime::modelconfig::mcomputegenerationlogits (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24mComputeGenerationLogitsE", false]], "tensorrt_llm::runtime::modelconfig::mcontextfmha (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mContextFMHAE", false]], "tensorrt_llm::runtime::modelconfig::mdatatype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9mDataTypeE", false]], "tensorrt_llm::runtime::modelconfig::mencoderhiddensize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mEncoderHiddenSizeE", false]], "tensorrt_llm::runtime::modelconfig::mgemmallreducedtype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig19mGemmAllReduceDtypeE", false]], "tensorrt_llm::runtime::modelconfig::mhiddensize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11mHiddenSizeE", false]], "tensorrt_llm::runtime::modelconfig::minputpacked (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mInputPackedE", false]], "tensorrt_llm::runtime::modelconfig::mkvcachetype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mKVCacheTypeE", false]], "tensorrt_llm::runtime::modelconfig::mlayertypes (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig11mLayerTypesE", false]], "tensorrt_llm::runtime::modelconfig::mlogitsdtype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mLogitsDtypeE", false]], "tensorrt_llm::runtime::modelconfig::mloramodules (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mLoraModulesE", false]], "tensorrt_llm::runtime::modelconfig::mmanageweightstype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mManageWeightsTypeE", false]], "tensorrt_llm::runtime::modelconfig::mmaxbatchsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mMaxBatchSizeE", false]], "tensorrt_llm::runtime::modelconfig::mmaxbeamwidth (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mMaxBeamWidthE", false]], "tensorrt_llm::runtime::modelconfig::mmaxencoderlen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14mMaxEncoderLenE", false]], "tensorrt_llm::runtime::modelconfig::mmaxinputlen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mMaxInputLenE", false]], "tensorrt_llm::runtime::modelconfig::mmaxlorarank (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mMaxLoraRankE", false]], "tensorrt_llm::runtime::modelconfig::mmaxnumtokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mMaxNumTokensE", false]], "tensorrt_llm::runtime::modelconfig::mmaxpositionembeddings (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22mMaxPositionEmbeddingsE", false]], "tensorrt_llm::runtime::modelconfig::mmaxpromptembeddingtablesize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig28mMaxPromptEmbeddingTableSizeE", false]], "tensorrt_llm::runtime::modelconfig::mmaxsequencelen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15mMaxSequenceLenE", false]], "tensorrt_llm::runtime::modelconfig::mmlphiddensize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14mMlpHiddenSizeE", false]], 
"tensorrt_llm::runtime::modelconfig::mmodelname (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig10mModelNameE", false]], "tensorrt_llm::runtime::modelconfig::mmodelvariant (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mModelVariantE", false]], "tensorrt_llm::runtime::modelconfig::mnbattentionlayers (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mNbAttentionLayersE", false]], "tensorrt_llm::runtime::modelconfig::mnbheads (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig8mNbHeadsE", false]], "tensorrt_llm::runtime::modelconfig::mnblayers (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9mNbLayersE", false]], "tensorrt_llm::runtime::modelconfig::mnbrnnlayers (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mNbRnnLayersE", false]], "tensorrt_llm::runtime::modelconfig::mnumkvheadsperattentionlayer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig28mNumKvHeadsPerAttentionLayerE", false]], "tensorrt_llm::runtime::modelconfig::mnumkvheadspercrossattentionlayer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig33mNumKvHeadsPerCrossAttentionLayerE", false]], "tensorrt_llm::runtime::modelconfig::mnumlanguages (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mNumLanguagesE", false]], "tensorrt_llm::runtime::modelconfig::modelconfig (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11ModelConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::modelconfig::modelvariant (c++ enum)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariantE", false]], "tensorrt_llm::runtime::modelconfig::modelvariant::kchatglm (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant8kChatGlmE", false]], "tensorrt_llm::runtime::modelconfig::modelvariant::kencdec (c++ enumerator)": [[1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant7kEncDecE", false]], "tensorrt_llm::runtime::modelconfig::modelvariant::kglm (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant4kGlmE", false]], "tensorrt_llm::runtime::modelconfig::modelvariant::kgpt (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant4kGptE", false]], "tensorrt_llm::runtime::modelconfig::modelvariant::kmamba (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant6kMambaE", false]], "tensorrt_llm::runtime::modelconfig::modelvariant::krecurrentgemma (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant15kRecurrentGemmaE", false]], "tensorrt_llm::runtime::modelconfig::mpagedcontextfmha (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17mPagedContextFMHAE", false]], "tensorrt_llm::runtime::modelconfig::mpagedstate (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11mPagedStateE", false]], "tensorrt_llm::runtime::modelconfig::mppreducescatter (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig16mPpReduceScatterE", false]], "tensorrt_llm::runtime::modelconfig::mquantmode (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig10mQuantModeE", false]], "tensorrt_llm::runtime::modelconfig::mrnnconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig10mRnnConfigE", false]], "tensorrt_llm::runtime::modelconfig::mrotaryembeddingdim (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig19mRotaryEmbeddingDimE", false]], "tensorrt_llm::runtime::modelconfig::msizeperhead (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mSizePerHeadE", false]], "tensorrt_llm::runtime::modelconfig::mskipcrossattnblocks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20mSkipCrossAttnBlocksE", false]], "tensorrt_llm::runtime::modelconfig::mspeculativedecodingmode (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig24mSpeculativeDecodingModeE", false]], "tensorrt_llm::runtime::modelconfig::mspeculativedecodingmodule (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26mSpeculativeDecodingModuleE", false]], "tensorrt_llm::runtime::modelconfig::mtokensperblock (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15mTokensPerBlockE", false]], "tensorrt_llm::runtime::modelconfig::musecrossattention (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mUseCrossAttentionE", false]], "tensorrt_llm::runtime::modelconfig::musegemmallreduceplugin (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig23mUseGemmAllReducePluginE", false]], "tensorrt_llm::runtime::modelconfig::musegptattentionplugin (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22mUseGptAttentionPluginE", false]], "tensorrt_llm::runtime::modelconfig::museloraplugin (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14mUseLoraPluginE", false]], "tensorrt_llm::runtime::modelconfig::musemambaconv1dplugin (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21mUseMambaConv1dPluginE", false]], "tensorrt_llm::runtime::modelconfig::musemrope (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9mUseMropeE", false]], "tensorrt_llm::runtime::modelconfig::musepositionembedding (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21mUsePositionEmbeddingE", false]], "tensorrt_llm::runtime::modelconfig::museshapeinference (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mUseShapeInferenceE", false]], "tensorrt_llm::runtime::modelconfig::musetokentypeembedding (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22mUseTokenTypeEmbeddingE", false]], "tensorrt_llm::runtime::modelconfig::mvocabsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig10mVocabSizeE", false]], "tensorrt_llm::runtime::modelconfig::resetspeculativedecodingmodule (c++ function)": 
[[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig30resetSpeculativeDecodingModuleEv", false]], "tensorrt_llm::runtime::modelconfig::rnnconfig (c++ struct)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfigE", false]], "tensorrt_llm::runtime::modelconfig::rnnconfig::convkernel (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig10convKernelE", false]], "tensorrt_llm::runtime::modelconfig::rnnconfig::rnnconvdimsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig14rnnConvDimSizeE", false]], "tensorrt_llm::runtime::modelconfig::rnnconfig::rnnheadsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig11rnnHeadSizeE", false]], "tensorrt_llm::runtime::modelconfig::rnnconfig::rnnhiddensize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig13rnnHiddenSizeE", false]], "tensorrt_llm::runtime::modelconfig::rnnconfig::statesize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig9stateSizeE", false]], "tensorrt_llm::runtime::modelconfig::setcontextfmha (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setContextFMHAEb", false]], "tensorrt_llm::runtime::modelconfig::setencoderhiddensize (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setEncoderHiddenSizeE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setgemmallreducedtype (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21setGemmAllReduceDtypeEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::modelconfig::setkvcachetype (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setKVCacheTypeE11KVCacheType", false]], "tensorrt_llm::runtime::modelconfig::setlayertypes (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13setLayerTypesERKNSt6vectorI9LayerTypeEE", false]], "tensorrt_llm::runtime::modelconfig::setlogitsdtype (c++ function)": [[1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig14setLogitsDtypeEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::modelconfig::setloramodules (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setLoraModulesERKNSt6vectorI10LoraModuleEE", false]], "tensorrt_llm::runtime::modelconfig::setmanageweightstype (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setManageWeightsTypeEK17ManageWeightsType", false]], "tensorrt_llm::runtime::modelconfig::setmaxbatchsize (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setMaxBatchSizeE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setmaxbeamwidth (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setMaxBeamWidthE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setmaxencoderlen (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig16setMaxEncoderLenE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setmaxinputlen (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setMaxInputLenE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setmaxlorarank (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setMaxLoraRankE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setmaxnumtokens (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setMaxNumTokensENSt8optionalI10SizeType32EE", false]], "tensorrt_llm::runtime::modelconfig::setmaxpositionembeddings (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24setMaxPositionEmbeddingsE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setmaxpromptembeddingtablesize (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig30setMaxPromptEmbeddingTableSizeE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setmaxsequencelen (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17setMaxSequenceLenE10SizeType32", false]], 
"tensorrt_llm::runtime::modelconfig::setmlphiddensize (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig16setMlpHiddenSizeE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setmodelname (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setModelNameERKNSt6stringE", false]], "tensorrt_llm::runtime::modelconfig::setmodelvariant (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setModelVariantE12ModelVariant", false]], "tensorrt_llm::runtime::modelconfig::setnbcrosskvheads (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17setNbCrossKvHeadsE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setnbkvheads (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setNbKvHeadsE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setnumkvheadspercrosslayer (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26setNumKvHeadsPerCrossLayerERKNSt6vectorI10SizeType32EE", false]], "tensorrt_llm::runtime::modelconfig::setnumkvheadsperlayer (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21setNumKvHeadsPerLayerERKNSt6vectorI10SizeType32EE", false]], "tensorrt_llm::runtime::modelconfig::setnumlanguages (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setNumLanguagesENSt8optionalI10SizeType32EE", false]], "tensorrt_llm::runtime::modelconfig::setpagedcontextfmha (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig19setPagedContextFMHAEb", false]], "tensorrt_llm::runtime::modelconfig::setppreducescatter (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18setPpReduceScatterEb", false]], "tensorrt_llm::runtime::modelconfig::setquantmode (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setQuantModeEN6common9QuantModeE", false]], "tensorrt_llm::runtime::modelconfig::setrnnconfig (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setRnnConfigERK9RnnConfig", false]], 
"tensorrt_llm::runtime::modelconfig::setrotaryembeddingdim (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21setRotaryEmbeddingDimE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setsizeperhead (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setSizePerHeadE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setskipcrossattnblocks (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22setSkipCrossAttnBlocksEb", false]], "tensorrt_llm::runtime::modelconfig::setspeculativedecodingmode (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26setSpeculativeDecodingModeE23SpeculativeDecodingMode", false]], "tensorrt_llm::runtime::modelconfig::setspeculativedecodingmodule (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig28setSpeculativeDecodingModuleERKNSt10shared_ptrI25SpeculativeDecodingModuleEE", false]], "tensorrt_llm::runtime::modelconfig::settokensperblock (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17setTokensPerBlockE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setusecrossattention (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setUseCrossAttentionEb", false]], "tensorrt_llm::runtime::modelconfig::setusemrope (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11setUseMropeEb", false]], "tensorrt_llm::runtime::modelconfig::setusepositionembedding (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig23setUsePositionEmbeddingEb", false]], "tensorrt_llm::runtime::modelconfig::setuseshapeinference (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setUseShapeInferenceEb", false]], "tensorrt_llm::runtime::modelconfig::setusetokentypeembedding (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24setUseTokenTypeEmbeddingEb", false]], "tensorrt_llm::runtime::modelconfig::skipcrossattnblocks (c++ function)": [[1, 
"_CPPv4NK12tensorrt_llm7runtime11ModelConfig19skipCrossAttnBlocksEv", false]], "tensorrt_llm::runtime::modelconfig::supportsinflightbatching (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig24supportsInflightBatchingEv", false]], "tensorrt_llm::runtime::modelconfig::usecrossattention (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig17useCrossAttentionEv", false]], "tensorrt_llm::runtime::modelconfig::usegemmallreduceplugin (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22useGemmAllReducePluginEb", false], [1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig22useGemmAllReducePluginEv", false]], "tensorrt_llm::runtime::modelconfig::usegptattentionplugin (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21useGptAttentionPluginEb", false], [1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21useGptAttentionPluginEv", false]], "tensorrt_llm::runtime::modelconfig::uselanguageadapter (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18useLanguageAdapterEv", false]], "tensorrt_llm::runtime::modelconfig::useloraplugin (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13useLoraPluginEb", false], [1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13useLoraPluginEv", false]], "tensorrt_llm::runtime::modelconfig::usemambaconv1dplugin (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20useMambaConv1dPluginEb", false], [1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20useMambaConv1dPluginEv", false]], "tensorrt_llm::runtime::modelconfig::usemrope (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig8useMropeEv", false]], "tensorrt_llm::runtime::modelconfig::usepackedinput (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14usePackedInputEb", false], [1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14usePackedInputEv", false]], "tensorrt_llm::runtime::modelconfig::usepagedstate (c++ function)": [[1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig13usePagedStateEb", false], [1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13usePagedStateEv", false]], "tensorrt_llm::runtime::modelconfig::usepositionembedding (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20usePositionEmbeddingEv", false]], "tensorrt_llm::runtime::modelconfig::useprompttuning (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15usePromptTuningEv", false]], "tensorrt_llm::runtime::modelconfig::useshapeinference (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig17useShapeInferenceEv", false]], "tensorrt_llm::runtime::modelconfig::usetokentypeembedding (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21useTokenTypeEmbeddingEv", false]], "tensorrt_llm::runtime::mpi_group_barrier (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17MPI_group_barrierENSt3setIiEE", false]], "tensorrt_llm::runtime::operator<< (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK10LoraModule", false], [1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK26LoraCachePageManagerConfig", false], [1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK7IBuffer", false], [1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK7ITensor", false], [1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERKN7ITensor5ShapeE", false], [1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERKN9LoraCache21TaskLayerModuleConfigE", false]], "tensorrt_llm::runtime::pointerelementtype (c++ type)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime18PointerElementTypeE", false]], "tensorrt_llm::runtime::prompttuningparams (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParamsE", false]], "tensorrt_llm::runtime::prompttuningparams::filltaskstensor (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams15fillTasksTensorE9TensorPtr10SizeType3210SizeType32RKNSt6vectorI10SizeType32EERKNSt6vectorI10SizeType32EERK13BufferManagerb", false]], 
"tensorrt_llm::runtime::prompttuningparams::prompttuningparams (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams18PromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", false]], "tensorrt_llm::runtime::prompttuningparams::sizetype32 (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams10SizeType32E", false]], "tensorrt_llm::runtime::prompttuningparams::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams9TensorPtrE", false]], "tensorrt_llm::runtime::rawengine (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngineE", false]], "tensorrt_llm::runtime::rawengine::getaddress (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine10getAddressEv", false]], "tensorrt_llm::runtime::rawengine::gethostmemory (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine13getHostMemoryEv", false]], "tensorrt_llm::runtime::rawengine::getmanagedweightsmapopt (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine23getManagedWeightsMapOptEv", false]], "tensorrt_llm::runtime::rawengine::getpath (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine7getPathEv", false]], "tensorrt_llm::runtime::rawengine::getpathopt (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine10getPathOptEv", false]], "tensorrt_llm::runtime::rawengine::getsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine7getSizeEv", false]], "tensorrt_llm::runtime::rawengine::gettype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine7getTypeEv", false]], "tensorrt_llm::runtime::rawengine::mengineaddr (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine11mEngineAddrE", false]], "tensorrt_llm::runtime::rawengine::menginebuffer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine13mEngineBufferE", false]], "tensorrt_llm::runtime::rawengine::menginepath (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine11mEnginePathE", false]], 
"tensorrt_llm::runtime::rawengine::menginesize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine11mEngineSizeE", false]], "tensorrt_llm::runtime::rawengine::mmanagedweightsmap (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine18mManagedWeightsMapE", false]], "tensorrt_llm::runtime::rawengine::mtype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine5mTypeE", false]], "tensorrt_llm::runtime::rawengine::rawengine (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineENSt10filesystem4pathE", false], [1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineEPKN8nvinfer111IHostMemoryE", false], [1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineEPKvNSt6size_tE", false]], "tensorrt_llm::runtime::rawengine::setmanagedweightsmap (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine20setManagedWeightsMapENSt3mapINSt6stringEN12tensorrt_llm8executor6TensorEEE", false]], "tensorrt_llm::runtime::rawengine::setpath (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine7setPathENSt10filesystem4pathE", false]], "tensorrt_llm::runtime::rawengine::type (c++ enum)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4TypeE", false]], "tensorrt_llm::runtime::rawengine::type::addresswithsize (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4Type15AddressWithSizeE", false]], "tensorrt_llm::runtime::rawengine::type::filepath (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4Type8FilePathE", false]], "tensorrt_llm::runtime::rawengine::type::hostmemory (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4Type10HostMemoryE", false]], "tensorrt_llm::runtime::requesttype (c++ enum)": [[1, "_CPPv4N12tensorrt_llm7runtime11RequestTypeE", false]], "tensorrt_llm::runtime::requesttype::kcontext (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11RequestType8kCONTEXTE", false]], "tensorrt_llm::runtime::requesttype::kgeneration (c++ enumerator)": [[1, 
"_CPPv4N12tensorrt_llm7runtime11RequestType11kGENERATIONE", false]], "tensorrt_llm::runtime::runtimedefaults (c++ struct)": [[1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaultsE", false]], "tensorrt_llm::runtime::runtimedefaults::maxattentionwindowvec (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults21maxAttentionWindowVecE", false]], "tensorrt_llm::runtime::runtimedefaults::runtimedefaults (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults15RuntimeDefaultsENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalI10SizeType32EE", false], [1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults15RuntimeDefaultsEv", false]], "tensorrt_llm::runtime::runtimedefaults::sinktokenlength (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults15sinkTokenLengthE", false]], "tensorrt_llm::runtime::samplingconfig (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfigE", false]], "tensorrt_llm::runtime::samplingconfig::beamsearchdiversityrate (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig23beamSearchDiversityRateE", false]], "tensorrt_llm::runtime::samplingconfig::beamwidth (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig9beamWidthE", false]], "tensorrt_llm::runtime::samplingconfig::beamwidtharray (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14beamWidthArrayE", false]], "tensorrt_llm::runtime::samplingconfig::cumlogprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig11cumLogProbsE", false]], "tensorrt_llm::runtime::samplingconfig::draftacceptancethreshold (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig24draftAcceptanceThresholdE", false]], "tensorrt_llm::runtime::samplingconfig::earlystopping (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig13earlyStoppingE", false]], "tensorrt_llm::runtime::samplingconfig::floattype (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig9FloatTypeE", false]], 
"tensorrt_llm::runtime::samplingconfig::frequencypenalty (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig16frequencyPenaltyE", false]], "tensorrt_llm::runtime::samplingconfig::fusevalues (c++ function)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig10fuseValuesE6OptVecI1TERKNSt6vectorI14SamplingConfigEENSt8functionIF6OptVecI1TE6size_tEEE1T", false]], "tensorrt_llm::runtime::samplingconfig::getmaxbeamwidth (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14SamplingConfig15getMaxBeamWidthEv", false]], "tensorrt_llm::runtime::samplingconfig::getnumreturnbeams (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14SamplingConfig17getNumReturnBeamsEv", false]], "tensorrt_llm::runtime::samplingconfig::lengthpenalty (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig13lengthPenaltyE", false]], "tensorrt_llm::runtime::samplingconfig::minlength (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig9minLengthE", false]], "tensorrt_llm::runtime::samplingconfig::minp (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig4minPE", false]], "tensorrt_llm::runtime::samplingconfig::norepeatngramsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig17noRepeatNgramSizeE", false]], "tensorrt_llm::runtime::samplingconfig::normalizelogprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig17normalizeLogProbsE", false]], "tensorrt_llm::runtime::samplingconfig::numreturnsequences (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig18numReturnSequencesE", false]], "tensorrt_llm::runtime::samplingconfig::operator== (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14SamplingConfigeqERK14SamplingConfig", false]], "tensorrt_llm::runtime::samplingconfig::optvec (c++ type)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig6OptVecE", false]], "tensorrt_llm::runtime::samplingconfig::originaltemperature (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime14SamplingConfig19originalTemperatureE", false]], "tensorrt_llm::runtime::samplingconfig::outputlogprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14outputLogProbsE", false]], "tensorrt_llm::runtime::samplingconfig::presencepenalty (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig15presencePenaltyE", false]], "tensorrt_llm::runtime::samplingconfig::randomseed (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig10randomSeedE", false]], "tensorrt_llm::runtime::samplingconfig::repetitionpenalty (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig17repetitionPenaltyE", false]], "tensorrt_llm::runtime::samplingconfig::samplingconfig (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigE10SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigERKN8executor14SamplingConfigERKNSt8optionalIN8executor25ExternalDraftTokensConfigEEE", false], [1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigERKNSt6vectorI14SamplingConfigEE", false]], "tensorrt_llm::runtime::samplingconfig::temperature (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig11temperatureE", false]], "tensorrt_llm::runtime::samplingconfig::topk (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig4topKE", false]], "tensorrt_llm::runtime::samplingconfig::topkmedusaheads (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig15topKMedusaHeadsE", false]], "tensorrt_llm::runtime::samplingconfig::topp (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig4topPE", false]], "tensorrt_llm::runtime::samplingconfig::toppdecay (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig9topPDecayE", false]], "tensorrt_llm::runtime::samplingconfig::toppmin (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig7topPMinE", false]], "tensorrt_llm::runtime::samplingconfig::toppresetids 
(c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig12topPResetIdsE", false]], "tensorrt_llm::runtime::samplingconfig::usedefaultvalues (c++ function)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig16useDefaultValuesEbRK6OptVecI1TE1T", false]], "tensorrt_llm::runtime::samplingconfig::validate (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig8validateEv", false]], "tensorrt_llm::runtime::samplingconfig::validatevec (c++ function)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig11validateVecEbNSt6stringERK6OptVecI1TE1TNSt8optionalI1TEE", false]], "tensorrt_llm::runtime::sizetype32 (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime10SizeType32E", false]], "tensorrt_llm::runtime::sizetype64 (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime10SizeType64E", false]], "tensorrt_llm::runtime::speculativedecodingmode (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingModeE", false]], "tensorrt_llm::runtime::speculativedecodingmode::allbitset (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode9allBitSetE14UnderlyingType", false]], "tensorrt_llm::runtime::speculativedecodingmode::anybitset (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode9anyBitSetE14UnderlyingType", false]], "tensorrt_llm::runtime::speculativedecodingmode::drafttokensexternal (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode19DraftTokensExternalEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::eagle (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode5EagleEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::explicitdrafttokens (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode19ExplicitDraftTokensEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::hasdraftlogits (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode14hasDraftLogitsEv", false]], 
"tensorrt_llm::runtime::speculativedecodingmode::isdrafttokensexternal (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode21isDraftTokensExternalEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::iseagle (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode7isEagleEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::isexplicitdrafttokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode21isExplicitDraftTokensEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::islookaheaddecoding (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode19isLookaheadDecodingEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::ismedusa (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode8isMedusaEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::isnone (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode6isNoneEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::kdrafttokensexternal (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode20kDraftTokensExternalE", false]], "tensorrt_llm::runtime::speculativedecodingmode::keagle (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode6kEagleE", false]], "tensorrt_llm::runtime::speculativedecodingmode::kexplicitdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode20kExplicitDraftTokensE", false]], "tensorrt_llm::runtime::speculativedecodingmode::klookaheaddecoding (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode18kLookaheadDecodingE", false]], "tensorrt_llm::runtime::speculativedecodingmode::kmedusa (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode7kMedusaE", false]], "tensorrt_llm::runtime::speculativedecodingmode::knone (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode5kNoneE", false]], "tensorrt_llm::runtime::speculativedecodingmode::lookaheaddecoding (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode17LookaheadDecodingEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::medusa (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode6MedusaEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::mstate (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode6mStateE", false]], "tensorrt_llm::runtime::speculativedecodingmode::needsdecoderprologue (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode20needsDecoderPrologueEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::needskvcacherewind (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode18needsKVCacheRewindEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::none (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode4NoneEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::operator== (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingModeeqERK23SpeculativeDecodingMode", false]], "tensorrt_llm::runtime::speculativedecodingmode::predictsdrafttokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode19predictsDraftTokensEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::requiresattentionmask (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode21requiresAttentionMaskEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::speculativedecodingmode (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode23SpeculativeDecodingModeE14UnderlyingType", false]], "tensorrt_llm::runtime::speculativedecodingmode::underlyingtype (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode14UnderlyingTypeE", false]], 
"tensorrt_llm::runtime::speculativedecodingmode::updatespositionids (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode18updatesPositionIdsEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::variabledraftlength (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode19variableDraftLengthEv", false]], "tensorrt_llm::runtime::speculativedecodingmodule (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModuleE", false]], "tensorrt_llm::runtime::speculativedecodingmodule::computenumpackedmasks (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule21computeNumPackedMasksEv", false]], "tensorrt_llm::runtime::speculativedecodingmodule::getmaxdecodingdrafttokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule25getMaxDecodingDraftTokensEv", false]], "tensorrt_llm::runtime::speculativedecodingmodule::getmaxdecodingtokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule20getMaxDecodingTokensEv", false]], "tensorrt_llm::runtime::speculativedecodingmodule::getmaxdraftpathlen (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule18getMaxDraftPathLenEv", false]], "tensorrt_llm::runtime::speculativedecodingmodule::getmaxnumpaths (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule14getMaxNumPathsEv", false]], "tensorrt_llm::runtime::speculativedecodingmodule::getmaxpathlen (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule13getMaxPathLenEv", false]], "tensorrt_llm::runtime::speculativedecodingmodule::getnumpackedmasks (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule17getNumPackedMasksEv", false]], "tensorrt_llm::runtime::speculativedecodingmodule::mmaxdecodingdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule23mMaxDecodingDraftTokensE", false]], 
"tensorrt_llm::runtime::speculativedecodingmodule::mmaxdraftpathlen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule16mMaxDraftPathLenE", false]], "tensorrt_llm::runtime::speculativedecodingmodule::mmaxnumpackedmasks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule18mMaxNumPackedMasksE", false]], "tensorrt_llm::runtime::speculativedecodingmodule::mmaxnumpaths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule12mMaxNumPathsE", false]], "tensorrt_llm::runtime::speculativedecodingmodule::operator= (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModuleaSERK25SpeculativeDecodingModule", false]], "tensorrt_llm::runtime::speculativedecodingmodule::setmaxdraftpathlen (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule18setMaxDraftPathLenE10SizeType32", false]], "tensorrt_llm::runtime::speculativedecodingmodule::setmaxdrafttokens (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule17setMaxDraftTokensE10SizeType32", false]], "tensorrt_llm::runtime::speculativedecodingmodule::setmaxnumpaths (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule14setMaxNumPathsE10SizeType32", false]], "tensorrt_llm::runtime::speculativedecodingmodule::speculativedecodingmodule (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleE10SizeType3210SizeType3210SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleERK25SpeculativeDecodingModule", false], [1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleEv", false]], "tensorrt_llm::runtime::speculativedecodingmodule::~speculativedecodingmodule (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModuleD0Ev", false]], "tensorrt_llm::runtime::stringptrmap (c++ type)": [[1, 
"_CPPv4I0EN12tensorrt_llm7runtime12StringPtrMapE", false]], "tensorrt_llm::runtime::tllmlogger (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime10TllmLoggerE", false]], "tensorrt_llm::runtime::tllmlogger::getlevel (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10TllmLogger8getLevelEv", false]], "tensorrt_llm::runtime::tllmlogger::log (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10TllmLogger3logE8SeverityPKN8nvinfer19AsciiCharE", false]], "tensorrt_llm::runtime::tllmlogger::setlevel (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10TllmLogger8setLevelE8Severity", false]], "tensorrt_llm::runtime::to_string (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9to_stringERK26LoraCachePageManagerConfig", false], [1, "_CPPv4N12tensorrt_llm7runtime9to_stringERKN9LoraCache21TaskLayerModuleConfigE", false]], "tensorrt_llm::runtime::tokenextraidtype (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime16TokenExtraIdTypeE", false]], "tensorrt_llm::runtime::tokenidtype (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime11TokenIdTypeE", false]], "tensorrt_llm::runtime::trtdatatype (c++ struct)": [[1, "_CPPv4I0_bEN12tensorrt_llm7runtime11TRTDataTypeE", false]], "tensorrt_llm::runtime::trtdatatype<bool> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIbEE", false]], "tensorrt_llm::runtime::trtdatatype<bool>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIbE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<float> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIfEE", false]], "tensorrt_llm::runtime::trtdatatype<float>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIfE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<half> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeI4halfEE", false]], "tensorrt_llm::runtime::trtdatatype<half>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeI4halfE5valueE", false]], 
"tensorrt_llm::runtime::trtdatatype<kernels::finishedstate> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIN7kernels13FinishedStateEEE", false]], "tensorrt_llm::runtime::trtdatatype<kernels::finishedstate>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIN7kernels13FinishedStateEE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<kernels::kvcacheindex> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIN7kernels12KVCacheIndexEEE", false]], "tensorrt_llm::runtime::trtdatatype<kernels::kvcacheindex>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIN7kernels12KVCacheIndexEE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<runtime::requesttype> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIN7runtime11RequestTypeEEE", false]], "tensorrt_llm::runtime::trtdatatype<runtime::requesttype>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIN7runtime11RequestTypeEE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<std::int32_t> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt7int32_tEEE", false]], "tensorrt_llm::runtime::trtdatatype<std::int32_t>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt7int32_tEE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<std::int64_t> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt7int64_tEEE", false]], "tensorrt_llm::runtime::trtdatatype<std::int64_t>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt7int64_tEE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<std::int8_t> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt6int8_tEEE", false]], "tensorrt_llm::runtime::trtdatatype<std::int8_t>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt6int8_tEE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<std::uint32_t> (c++ struct)": [[1, 
"_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt8uint32_tEEE", false]], "tensorrt_llm::runtime::trtdatatype<std::uint32_t>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt8uint32_tEE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<std::uint64_t> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt8uint64_tEEE", false]], "tensorrt_llm::runtime::trtdatatype<std::uint64_t>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt8uint64_tEE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<std::uint8_t> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt7uint8_tEEE", false]], "tensorrt_llm::runtime::trtdatatype<std::uint8_t>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt7uint8_tEE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<t*> (c++ struct)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime11TRTDataTypeIP1TEE", false]], "tensorrt_llm::runtime::trtdatatype<t*>::kunderlyingtype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIP1TE15kUnderlyingTypeE", false]], "tensorrt_llm::runtime::trtdatatype<t*>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIP1TE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<void*> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIPvEE", false]], "tensorrt_llm::runtime::trtdatatype<void*>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIPvE5valueE", false]], "tensorrt_llm::runtime::uniquetoken (c++ struct)": [[1, "_CPPv4N12tensorrt_llm7runtime11UniqueTokenE", false]], "tensorrt_llm::runtime::uniquetoken::operator== (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11UniqueTokeneqERK11UniqueToken", false]], "tensorrt_llm::runtime::uniquetoken::tokenextraid (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11UniqueToken12tokenExtraIdE", false]], "tensorrt_llm::runtime::uniquetoken::tokenid (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime11UniqueToken7tokenIdE", false]], "tensorrt_llm::runtime::vectokenextraids (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime16VecTokenExtraIdsE", false]], "tensorrt_llm::runtime::vecuniquetokens (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime15VecUniqueTokensE", false]], "tensorrt_llm::runtime::worldconfig (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfigE", false]], "tensorrt_llm::runtime::worldconfig::enableattentiondp (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig17enableAttentionDPEv", false]], "tensorrt_llm::runtime::worldconfig::getcontextparallelgroup (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig23getContextParallelGroupEv", false]], "tensorrt_llm::runtime::worldconfig::getcontextparallelism (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig21getContextParallelismEv", false]], "tensorrt_llm::runtime::worldconfig::getcontextparallelrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig22getContextParallelRankEv", false]], "tensorrt_llm::runtime::worldconfig::getdevice (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig9getDeviceEv", false]], "tensorrt_llm::runtime::worldconfig::getdeviceof (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig11getDeviceOfE10SizeType32", false]], "tensorrt_llm::runtime::worldconfig::getgpuspergroup (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig15getGpusPerGroupEv", false]], "tensorrt_llm::runtime::worldconfig::getgpuspernode (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig14getGpusPerNodeEv", false]], "tensorrt_llm::runtime::worldconfig::getlastrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig11getLastRankEv", false]], "tensorrt_llm::runtime::worldconfig::getlocalrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig12getLocalRankEv", false]], "tensorrt_llm::runtime::worldconfig::getnoderank (c++ function)": 
[[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig11getNodeRankEv", false]], "tensorrt_llm::runtime::worldconfig::getnoderankof (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig13getNodeRankOfE10SizeType32", false]], "tensorrt_llm::runtime::worldconfig::getpipelineparallelgroup (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig24getPipelineParallelGroupEv", false]], "tensorrt_llm::runtime::worldconfig::getpipelineparallelism (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig22getPipelineParallelismEv", false]], "tensorrt_llm::runtime::worldconfig::getpipelineparallelrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig23getPipelineParallelRankEv", false]], "tensorrt_llm::runtime::worldconfig::getrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig7getRankEv", false]], "tensorrt_llm::runtime::worldconfig::getsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig7getSizeEv", false]], "tensorrt_llm::runtime::worldconfig::gettensorparallelgroup (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig22getTensorParallelGroupEv", false]], "tensorrt_llm::runtime::worldconfig::gettensorparallelism (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig20getTensorParallelismEv", false]], "tensorrt_llm::runtime::worldconfig::gettensorparallelrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig21getTensorParallelRankEv", false]], "tensorrt_llm::runtime::worldconfig::iscontextparallel (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig17isContextParallelEv", false]], "tensorrt_llm::runtime::worldconfig::isfirstcontextparallelrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig26isFirstContextParallelRankEv", false]], "tensorrt_llm::runtime::worldconfig::isfirstpipelineparallelrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig27isFirstPipelineParallelRankEv", false]], 
"tensorrt_llm::runtime::worldconfig::isfirsttensorparallelrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig25isFirstTensorParallelRankEv", false]], "tensorrt_llm::runtime::worldconfig::islastpipelineparallelrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig26isLastPipelineParallelRankEv", false]], "tensorrt_llm::runtime::worldconfig::ispipelineparallel (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig18isPipelineParallelEv", false]], "tensorrt_llm::runtime::worldconfig::istensorparallel (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig16isTensorParallelEv", false]], "tensorrt_llm::runtime::worldconfig::kdefaultgpuspernode (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig19kDefaultGpusPerNodeE", false]], "tensorrt_llm::runtime::worldconfig::mcontextparallelism (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig19mContextParallelismE", false]], "tensorrt_llm::runtime::worldconfig::mdeviceids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig10mDeviceIdsE", false]], "tensorrt_llm::runtime::worldconfig::menableattentiondp (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig18mEnableAttentionDPE", false]], "tensorrt_llm::runtime::worldconfig::mgpuspernode (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig12mGpusPerNodeE", false]], "tensorrt_llm::runtime::worldconfig::mpi (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig3mpiE10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEEb", false]], "tensorrt_llm::runtime::worldconfig::mpipelineparallelism (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig20mPipelineParallelismE", false]], "tensorrt_llm::runtime::worldconfig::mrank (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig5mRankE", false]], "tensorrt_llm::runtime::worldconfig::mtensorparallelism (c++ member)": 
[[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig18mTensorParallelismE", false]], "tensorrt_llm::runtime::worldconfig::validmpiconfig (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig14validMpiConfigEv", false]], "tensorrt_llm::runtime::worldconfig::worldconfig (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig11WorldConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalINSt6vectorI10SizeType32EEEEb", false]], "text (tensorrt_llm.llmapi.completionoutput attribute)": [[65, "tensorrt_llm.llmapi.CompletionOutput.text", false]], "text_diff (tensorrt_llm.llmapi.completionoutput attribute)": [[65, "tensorrt_llm.llmapi.CompletionOutput.text_diff", false]], "text_diff (tensorrt_llm.llmapi.completionoutput property)": [[65, "id4", false]], "timestepembedding (class in tensorrt_llm.layers.embedding)": [[78, "tensorrt_llm.layers.embedding.TimestepEmbedding", false]], "timesteps (class in tensorrt_llm.layers.embedding)": [[78, "tensorrt_llm.layers.embedding.Timesteps", false]], "to_dict() (tensorrt_llm.llmapi.buildconfig method)": [[65, "tensorrt_llm.llmapi.BuildConfig.to_dict", false]], "to_dict() (tensorrt_llm.llmapi.calibconfig method)": [[65, "tensorrt_llm.llmapi.CalibConfig.to_dict", false]], "to_dict() (tensorrt_llm.llmapi.quantconfig method)": [[65, "tensorrt_llm.llmapi.QuantConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.chatglmconfig method)": [[79, "tensorrt_llm.models.ChatGLMConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.cogvlmconfig method)": [[79, "tensorrt_llm.models.CogVLMConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.dbrxconfig method)": [[79, "tensorrt_llm.models.DbrxConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.falconconfig method)": [[79, "tensorrt_llm.models.FalconConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.gemmaconfig method)": [[79, "tensorrt_llm.models.GemmaConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.gptconfig method)": [[79, 
"tensorrt_llm.models.GPTConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.gptjconfig method)": [[79, "tensorrt_llm.models.GPTJConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.llamaconfig method)": [[79, "tensorrt_llm.models.LLaMAConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.medusaconfig method)": [[79, "tensorrt_llm.models.MedusaConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.pretrainedconfig method)": [[79, "tensorrt_llm.models.PretrainedConfig.to_dict", false]], "to_json_file() (tensorrt_llm.models.pretrainedconfig method)": [[79, "tensorrt_llm.models.PretrainedConfig.to_json_file", false]], "to_layer_quant_config() (tensorrt_llm.models.pretrainedconfig method)": [[79, "tensorrt_llm.models.PretrainedConfig.to_layer_quant_config", false]], "to_legacy_setting() (tensorrt_llm.plugin.pluginconfig method)": [[80, "tensorrt_llm.plugin.PluginConfig.to_legacy_setting", false]], "token_drop() (tensorrt_llm.layers.embedding.labelembedding method)": [[78, "tensorrt_llm.layers.embedding.LabelEmbedding.token_drop", false]], "token_end (tensorrt_llm.llmapi.kvcacheretentionconfig.tokenrangeretentionconfig property)": [[65, "tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.token_end", false]], "token_ids (tensorrt_llm.llmapi.completionoutput attribute)": [[65, "tensorrt_llm.llmapi.CompletionOutput.token_ids", false]], "token_ids_diff (tensorrt_llm.llmapi.completionoutput attribute)": [[65, "tensorrt_llm.llmapi.CompletionOutput.token_ids_diff", false]], "token_ids_diff (tensorrt_llm.llmapi.completionoutput property)": [[65, "id5", false]], "token_range_retention_configs (tensorrt_llm.llmapi.kvcacheretentionconfig property)": [[65, "tensorrt_llm.llmapi.KvCacheRetentionConfig.token_range_retention_configs", false]], "token_start (tensorrt_llm.llmapi.kvcacheretentionconfig.tokenrangeretentionconfig property)": [[65, "tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.token_start", false]], "tokenizer 
(tensorrt_llm.llmapi.llm attribute)": [[65, "tensorrt_llm.llmapi.LLM.tokenizer", false]], "tokenizer (tensorrt_llm.llmapi.llm property)": [[65, "id0", false]], "tokenizer_image_token() (tensorrt_llm.runtime.multimodalmodelrunner static method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.tokenizer_image_token", false]], "tokenizer_max_seq_length (tensorrt_llm.llmapi.calibconfig attribute)": [[65, "tensorrt_llm.llmapi.CalibConfig.tokenizer_max_seq_length", false]], "tokens_per_block (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.tokens_per_block", false]], "tokens_per_block (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.tokens_per_block", false]], "top_k (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.top_k", false]], "top_k (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.top_k", false]], "top_p (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.top_p", false]], "top_p (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.top_p", false]], "top_p_decay (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.top_p_decay", false]], "top_p_decay (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.top_p_decay", false]], "top_p_min (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.top_p_min", false]], "top_p_min (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.top_p_min", false]], "top_p_reset_ids (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.top_p_reset_ids", false]], "top_p_reset_ids (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.top_p_reset_ids", false]], 
"topk() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.topk", false]], "tp_split_dim() (tensorrt_llm.layers.linear.linear class method)": [[78, "tensorrt_llm.layers.linear.Linear.tp_split_dim", false]], "tp_split_dim() (tensorrt_llm.layers.linear.linearbase class method)": [[78, "tensorrt_llm.layers.linear.LinearBase.tp_split_dim", false]], "tp_split_dim() (tensorrt_llm.layers.linear.rowlinear class method)": [[78, "tensorrt_llm.layers.linear.RowLinear.tp_split_dim", false]], "transpose() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.transpose", false]], "transpose() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.transpose", false]], "trtllm-serve-disaggregated command line option": [[26, "cmdoption-trtllm-serve-disaggregated-c", false], [26, "cmdoption-trtllm-serve-disaggregated-r", false], [26, "cmdoption-trtllm-serve-disaggregated-t", false]], "trtllm-serve-disaggregated_mpi_worker command line option": [[26, "cmdoption-trtllm-serve-disaggregated_mpi_worker-c", false], [26, "cmdoption-trtllm-serve-disaggregated_mpi_worker-log_level", false]], "trtllm-serve-serve command line option": [[26, "cmdoption-trtllm-serve-serve-arg-MODEL", false], [26, "cmdoption-trtllm-serve-serve-backend", false], [26, "cmdoption-trtllm-serve-serve-cluster_size", false], [26, "cmdoption-trtllm-serve-serve-ep_size", false], [26, "cmdoption-trtllm-serve-serve-extra_llm_api_options", false], [26, "cmdoption-trtllm-serve-serve-gpus_per_node", false], [26, "cmdoption-trtllm-serve-serve-host", false], [26, "cmdoption-trtllm-serve-serve-kv_cache_free_gpu_memory_fraction", false], [26, "cmdoption-trtllm-serve-serve-log_level", false], [26, "cmdoption-trtllm-serve-serve-max_batch_size", false], [26, "cmdoption-trtllm-serve-serve-max_beam_width", false], [26, "cmdoption-trtllm-serve-serve-max_num_tokens", false], [26, "cmdoption-trtllm-serve-serve-max_seq_len", false], [26, 
"cmdoption-trtllm-serve-serve-num_postprocess_workers", false], [26, "cmdoption-trtllm-serve-serve-port", false], [26, "cmdoption-trtllm-serve-serve-pp_size", false], [26, "cmdoption-trtllm-serve-serve-reasoning_parser", false], [26, "cmdoption-trtllm-serve-serve-tokenizer", false], [26, "cmdoption-trtllm-serve-serve-tp_size", false], [26, "cmdoption-trtllm-serve-serve-trust_remote_code", false]], "trtllm_modules_to_hf_modules (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.trtllm_modules_to_hf_modules", false]], "truncate_prompt_tokens (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.truncate_prompt_tokens", false]], "twoshot (tensorrt_llm.functional.allreducestrategy attribute)": [[77, "tensorrt_llm.functional.AllReduceStrategy.TWOSHOT", false]], "ub (tensorrt_llm.functional.allreducestrategy attribute)": [[77, "tensorrt_llm.functional.AllReduceStrategy.UB", false]], "unary() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.unary", false]], "unbind() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.unbind", false]], "unbind() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.unbind", false]], "unfuse_qkv_projections() (tensorrt_llm.models.sd3transformer2dmodel method)": [[79, "tensorrt_llm.models.SD3Transformer2DModel.unfuse_qkv_projections", false]], "unpatchify() (tensorrt_llm.models.dit method)": [[79, "tensorrt_llm.models.DiT.unpatchify", false]], "unsqueeze() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.unsqueeze", false]], "unsqueeze() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.unsqueeze", false]], "update() (tensorrt_llm.llmapi.buildconfig method)": [[65, "tensorrt_llm.llmapi.BuildConfig.update", false]], "update() (tensorrt_llm.runtime.samplingconfig method)": [[82, "tensorrt_llm.runtime.SamplingConfig.update", false]], "update_from_dict() 
(tensorrt_llm.llmapi.buildconfig method)": [[65, "tensorrt_llm.llmapi.BuildConfig.update_from_dict", false]], "update_kv_cache_type() (tensorrt_llm.llmapi.buildconfig method)": [[65, "tensorrt_llm.llmapi.BuildConfig.update_kv_cache_type", false]], "update_output_ids_by_offset() (tensorrt_llm.runtime.generationsession method)": [[82, "tensorrt_llm.runtime.GenerationSession.update_output_ids_by_offset", false]], "update_strategy() (tensorrt_llm.functional.allreduceparams method)": [[77, "tensorrt_llm.functional.AllReduceParams.update_strategy", false]], "use_beam_hyps (tensorrt_llm.runtime.samplingconfig attribute)": [[82, "tensorrt_llm.runtime.SamplingConfig.use_beam_hyps", false]], "use_beam_search (tensorrt_llm.llmapi.samplingparams attribute)": [[65, "tensorrt_llm.llmapi.SamplingParams.use_beam_search", false]], "use_dynamic_tree (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.EagleDecodingConfig.use_dynamic_tree", false]], "use_gemm_allreduce_plugin (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.use_gemm_allreduce_plugin", false]], "use_gpt_attention_plugin (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.use_gpt_attention_plugin", false]], "use_kv_cache (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.use_kv_cache", false]], "use_lora() (tensorrt_llm.models.decodermodel method)": [[79, "tensorrt_llm.models.DecoderModel.use_lora", false]], "use_lora() (tensorrt_llm.models.encodermodel method)": [[79, "tensorrt_llm.models.EncoderModel.use_lora", false]], "use_lora() (tensorrt_llm.models.gemmaforcausallm method)": [[79, "tensorrt_llm.models.GemmaForCausalLM.use_lora", false]], "use_lora() (tensorrt_llm.models.gptforcausallm method)": [[79, "tensorrt_llm.models.GPTForCausalLM.use_lora", false]], "use_lora() (tensorrt_llm.models.llamaforcausallm method)": [[79, 
"tensorrt_llm.models.LLaMAForCausalLM.use_lora", false]], "use_lora() (tensorrt_llm.models.mllamaforcausallm method)": [[79, "tensorrt_llm.models.MLLaMAForCausalLM.use_lora", false]], "use_lora() (tensorrt_llm.models.phi3forcausallm method)": [[79, "tensorrt_llm.models.Phi3ForCausalLM.use_lora", false]], "use_lora() (tensorrt_llm.models.phiforcausallm method)": [[79, "tensorrt_llm.models.PhiForCausalLM.use_lora", false]], "use_lora_plugin (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.use_lora_plugin", false]], "use_lora_plugin (tensorrt_llm.runtime.modelrunner property)": [[82, "tensorrt_llm.runtime.ModelRunner.use_lora_plugin", false]], "use_mamba_conv1d_plugin (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.use_mamba_conv1d_plugin", false]], "use_meta_recipe (tensorrt_llm.llmapi.quantconfig attribute)": [[65, "tensorrt_llm.llmapi.QuantConfig.use_meta_recipe", false]], "use_mrope (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.use_mrope", false]], "use_prompt_tuning() (tensorrt_llm.models.encodermodel method)": [[79, "tensorrt_llm.models.EncoderModel.use_prompt_tuning", false]], "use_refit (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.use_refit", false]], "use_relaxed_acceptance_for_thinking (tensorrt_llm.llmapi.mtpdecodingconfig attribute)": [[65, "tensorrt_llm.llmapi.MTPDecodingConfig.use_relaxed_acceptance_for_thinking", false]], "use_strip_plan (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.use_strip_plan", false]], "validate_positive_values() (tensorrt_llm.llmapi.lookaheaddecodingconfig class method)": [[65, "tensorrt_llm.llmapi.LookaheadDecodingConfig.validate_positive_values", false]], "verbatim (tensorrt_llm.models.gemmaconfig attribute)": [[79, "tensorrt_llm.models.GemmaConfig.VERBATIM", false]], "video_preprocess() 
(tensorrt_llm.runtime.multimodalmodelrunner method)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.video_preprocess", false]], "view() (in module tensorrt_llm.functional)": [[77, "tensorrt_llm.functional.view", false]], "view() (tensorrt_llm.functional.tensor method)": [[77, "tensorrt_llm.functional.Tensor.view", false]], "view() (tensorrt_llm.runtime.tensorinfo method)": [[82, "tensorrt_llm.runtime.TensorInfo.view", false]], "visual_engine_dir (tensorrt_llm.runtime.multimodalmodelrunner property)": [[82, "tensorrt_llm.runtime.MultimodalModelRunner.visual_engine_dir", false]], "visualize_network (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.visualize_network", false]], "vocab_size (tensorrt_llm.runtime.generationsession property)": [[82, "tensorrt_llm.runtime.GenerationSession.vocab_size", false]], "vocab_size (tensorrt_llm.runtime.modelconfig attribute)": [[82, "tensorrt_llm.runtime.ModelConfig.vocab_size", false]], "vocab_size (tensorrt_llm.runtime.modelrunner property)": [[82, "tensorrt_llm.runtime.ModelRunner.vocab_size", false]], "vocab_size (tensorrt_llm.runtime.modelrunnercpp property)": [[82, "tensorrt_llm.runtime.ModelRunnerCpp.vocab_size", false]], "vocab_size_padded (tensorrt_llm.runtime.modelrunner property)": [[82, "tensorrt_llm.runtime.ModelRunner.vocab_size_padded", false]], "vocab_size_padded (tensorrt_llm.runtime.modelrunnercpp property)": [[82, "tensorrt_llm.runtime.ModelRunnerCpp.vocab_size_padded", false]], "w4a16 (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.W4A16", false]], "w4a16_awq (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.W4A16_AWQ", false]], "w4a16_gptq (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.W4A16_GPTQ", false]], "w4a8_awq (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.W4A8_AWQ", false]], "w4a8_qserve_per_channel (tensorrt_llm.llmapi.quantalgo 
attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.W4A8_QSERVE_PER_CHANNEL", false]], "w4a8_qserve_per_group (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.W4A8_QSERVE_PER_GROUP", false]], "w8a16 (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.W8A16", false]], "w8a16_gptq (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.W8A16_GPTQ", false]], "w8a8_sq_per_channel (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL", false]], "w8a8_sq_per_channel_per_tensor_plugin (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN", false]], "w8a8_sq_per_channel_per_token_plugin (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN", false]], "w8a8_sq_per_tensor_per_token_plugin (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN", false]], "w8a8_sq_per_tensor_plugin (tensorrt_llm.llmapi.quantalgo attribute)": [[65, "tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN", false]], "weight_loader() (tensorrt_llm.layers.attention.deepseekv2attention method)": [[78, "tensorrt_llm.layers.attention.DeepseekV2Attention.weight_loader", false]], "weight_loader() (tensorrt_llm.layers.embedding.embedding method)": [[78, "tensorrt_llm.layers.embedding.Embedding.weight_loader", false]], "weight_loader() (tensorrt_llm.layers.linear.linearbase method)": [[78, "tensorrt_llm.layers.linear.LinearBase.weight_loader", false]], "weight_sparsity (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.weight_sparsity", false]], "weight_streaming (tensorrt_llm.llmapi.buildconfig attribute)": [[65, "tensorrt_llm.llmapi.BuildConfig.weight_streaming", false]], "where() (in module tensorrt_llm.functional)": [[77, 
"tensorrt_llm.functional.where", false]], "whisperencoder (class in tensorrt_llm.models)": [[79, "tensorrt_llm.models.WhisperEncoder", false]], "workspace (tensorrt_llm.llmapi.llm attribute)": [[65, "tensorrt_llm.llmapi.LLM.workspace", false]], "workspace (tensorrt_llm.llmapi.llm property)": [[65, "id1", false]], "yarn (tensorrt_llm.functional.positionembeddingtype attribute)": [[77, "tensorrt_llm.functional.PositionEmbeddingType.yarn", false]], "yarn (tensorrt_llm.functional.rotaryscalingtype attribute)": [[77, "tensorrt_llm.functional.RotaryScalingType.yarn", false]]}, "objects": {"": [[1, 0, 1, "c.FMT_DIM", "FMT_DIM"], [1, 0, 1, "c.SET_FROM_OPTIONAL", "SET_FROM_OPTIONAL"], [1, 1, 1, "_CPPv48nvinfer1", "nvinfer1"], [0, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [0, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [0, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [0, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [0, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [0, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [0, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", 
"tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [0, 1, 1, "_CPPv4N12tensorrt_llm13batch_managerE", "tensorrt_llm::batch_manager"], [0, 1, 1, "_CPPv4N12tensorrt_llm13batch_managerE", "tensorrt_llm::batch_manager"], [1, 1, 1, "_CPPv4N12tensorrt_llm13batch_managerE", "tensorrt_llm::batch_manager"], [1, 1, 1, "_CPPv4N12tensorrt_llm13batch_managerE", "tensorrt_llm::batch_manager"], [1, 1, 1, "_CPPv4N12tensorrt_llm13batch_managerE", "tensorrt_llm::batch_manager"], [1, 1, 1, "_CPPv4N12tensorrt_llm13batch_managerE", "tensorrt_llm::batch_manager"], [0, 1, 1, "_CPPv4N12tensorrt_llm13batch_manager16kv_cache_managerE", "tensorrt_llm::batch_manager::kv_cache_manager"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executorE", "tensorrt_llm::executor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executorE", "tensorrt_llm::executor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executorE", "tensorrt_llm::executor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executorE", "tensorrt_llm::executor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executorE", "tensorrt_llm::executor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executorE", "tensorrt_llm::executor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executorE", 
"tensorrt_llm::executor"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutputE", "tensorrt_llm::executor::AdditionalModelOutput"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutput21AdditionalModelOutputENSt6stringEb", "tensorrt_llm::executor::AdditionalModelOutput::AdditionalModelOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutput21AdditionalModelOutputENSt6stringEb", "tensorrt_llm::executor::AdditionalModelOutput::AdditionalModelOutput::gatherContext"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutput21AdditionalModelOutputENSt6stringEb", "tensorrt_llm::executor::AdditionalModelOutput::AdditionalModelOutput::name"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutput13gatherContextE", "tensorrt_llm::executor::AdditionalModelOutput::gatherContext"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutput4nameE", "tensorrt_llm::executor::AdditionalModelOutput::name"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor21AdditionalModelOutputeqERK21AdditionalModelOutput", "tensorrt_llm::executor::AdditionalModelOutput::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor21AdditionalModelOutputeqERK21AdditionalModelOutput", "tensorrt_llm::executor::AdditionalModelOutput::operator==::other"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputE", "tensorrt_llm::executor::AdditionalOutput"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputENSt6stringE6Tensor", "tensorrt_llm::executor::AdditionalOutput::AdditionalOutput"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputERK16AdditionalOutput", "tensorrt_llm::executor::AdditionalOutput::AdditionalOutput"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputERR16AdditionalOutput", "tensorrt_llm::executor::AdditionalOutput::AdditionalOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputENSt6stringE6Tensor", 
"tensorrt_llm::executor::AdditionalOutput::AdditionalOutput::name"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputERK16AdditionalOutput", "tensorrt_llm::executor::AdditionalOutput::AdditionalOutput::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputERR16AdditionalOutput", "tensorrt_llm::executor::AdditionalOutput::AdditionalOutput::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputENSt6stringE6Tensor", "tensorrt_llm::executor::AdditionalOutput::AdditionalOutput::output"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput4nameE", "tensorrt_llm::executor::AdditionalOutput::name"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputaSERK16AdditionalOutput", "tensorrt_llm::executor::AdditionalOutput::operator="], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputaSERR16AdditionalOutput", "tensorrt_llm::executor::AdditionalOutput::operator="], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputaSERK16AdditionalOutput", "tensorrt_llm::executor::AdditionalOutput::operator=::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputaSERR16AdditionalOutput", "tensorrt_llm::executor::AdditionalOutput::operator=::other"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput6outputE", "tensorrt_llm::executor::AdditionalOutput::output"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputD0Ev", "tensorrt_llm::executor::AdditionalOutput::~AdditionalOutput"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor12BatchingTypeE", "tensorrt_llm::executor::BatchingType"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12BatchingType9kINFLIGHTE", "tensorrt_llm::executor::BatchingType::kINFLIGHT"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12BatchingType7kSTATICE", "tensorrt_llm::executor::BatchingType::kSTATIC"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor10BeamTokensE", "tensorrt_llm::executor::BeamTokens"], [0, 1, 1, 
"_CPPv4N12tensorrt_llm8executor10BufferViewE", "tensorrt_llm::executor::BufferView"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfigE", "tensorrt_llm::executor::CacheTransceiverConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfig22CacheTransceiverConfigENSt8optionalI6size_tEE", "tensorrt_llm::executor::CacheTransceiverConfig::CacheTransceiverConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfig22CacheTransceiverConfigENSt8optionalI6size_tEE", "tensorrt_llm::executor::CacheTransceiverConfig::CacheTransceiverConfig::maxNumTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor22CacheTransceiverConfig15getMaxNumTokensEv", "tensorrt_llm::executor::CacheTransceiverConfig::getMaxNumTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfig13mMaxNumTokensE", "tensorrt_llm::executor::CacheTransceiverConfig::mMaxNumTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor22CacheTransceiverConfigeqERK22CacheTransceiverConfig", "tensorrt_llm::executor::CacheTransceiverConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor22CacheTransceiverConfigeqERK22CacheTransceiverConfig", "tensorrt_llm::executor::CacheTransceiverConfig::operator==::other"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfig15setMaxNumTokensE6size_t", "tensorrt_llm::executor::CacheTransceiverConfig::setMaxNumTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfig15setMaxNumTokensE6size_t", "tensorrt_llm::executor::CacheTransceiverConfig::setMaxNumTokens::maxNumTokens"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor23CapacitySchedulerPolicyE", "tensorrt_llm::executor::CapacitySchedulerPolicy"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor23CapacitySchedulerPolicy20kGUARANTEED_NO_EVICTE", "tensorrt_llm::executor::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor23CapacitySchedulerPolicy16kMAX_UTILIZATIONE", 
"tensorrt_llm::executor::CapacitySchedulerPolicy::kMAX_UTILIZATION"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor23CapacitySchedulerPolicy13kSTATIC_BATCHE", "tensorrt_llm::executor::CapacitySchedulerPolicy::kSTATIC_BATCH"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor17CommunicationModeE", "tensorrt_llm::executor::CommunicationMode"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor17CommunicationMode7kLEADERE", "tensorrt_llm::executor::CommunicationMode::kLEADER"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor17CommunicationMode13kORCHESTRATORE", "tensorrt_llm::executor::CommunicationMode::kORCHESTRATOR"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor17CommunicationTypeE", "tensorrt_llm::executor::CommunicationType"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor17CommunicationType4kMPIE", "tensorrt_llm::executor::CommunicationType::kMPI"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor21ContextChunkingPolicyE", "tensorrt_llm::executor::ContextChunkingPolicy"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor21ContextChunkingPolicy15kEQUAL_PROGRESSE", "tensorrt_llm::executor::ContextChunkingPolicy::kEQUAL_PROGRESS"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor21ContextChunkingPolicy24kFIRST_COME_FIRST_SERVEDE", "tensorrt_llm::executor::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParamsE", "tensorrt_llm::executor::ContextPhaseParams"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeNSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypePvNSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeRKNSt6vectorIcEENSt8optionalI9VecTokensEE", 
"tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsERK18ContextPhaseParams", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsERR18ContextPhaseParams", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeNSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::draftTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypePvNSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::draftTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeRKNSt6vectorIcEENSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::draftTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeNSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::firstGenTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypePvNSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::firstGenTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeRKNSt6vectorIcEENSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::firstGenTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeNSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::reqId"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypePvNSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::reqId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeRKNSt6vectorIcEENSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::reqId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeRKNSt6vectorIcEENSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::serializedState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypePvNSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::state"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams13RequestIdTypeE", "tensorrt_llm::executor::ContextPhaseParams::RequestIdType"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams8StatePtrE", "tensorrt_llm::executor::ContextPhaseParams::StatePtr"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams7deleterEPKv", "tensorrt_llm::executor::ContextPhaseParams::deleter"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams7deleterEPKv", "tensorrt_llm::executor::ContextPhaseParams::deleter::data"], [0, 3, 1, "_CPPv4NKR12tensorrt_llm8executor18ContextPhaseParams14getDraftTokensEv", "tensorrt_llm::executor::ContextPhaseParams::getDraftTokens"], [0, 3, 1, "_CPPv4NKR12tensorrt_llm8executor18ContextPhaseParams17getFirstGenTokensEv", "tensorrt_llm::executor::ContextPhaseParams::getFirstGenTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18ContextPhaseParams8getReqIdEv", "tensorrt_llm::executor::ContextPhaseParams::getReqId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18ContextPhaseParams18getSerializedStateEv", 
"tensorrt_llm::executor::ContextPhaseParams::getSerializedState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams8getStateEv", "tensorrt_llm::executor::ContextPhaseParams::getState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18ContextPhaseParams8getStateEv", "tensorrt_llm::executor::ContextPhaseParams::getState"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams12mDraftTokensE", "tensorrt_llm::executor::ContextPhaseParams::mDraftTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams15mFirstGenTokensE", "tensorrt_llm::executor::ContextPhaseParams::mFirstGenTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams6mReqIdE", "tensorrt_llm::executor::ContextPhaseParams::mReqId"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams6mStateE", "tensorrt_llm::executor::ContextPhaseParams::mState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParamsaSERK18ContextPhaseParams", "tensorrt_llm::executor::ContextPhaseParams::operator="], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParamsaSERR18ContextPhaseParams", "tensorrt_llm::executor::ContextPhaseParams::operator="], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18ContextPhaseParamseqERK18ContextPhaseParams", "tensorrt_llm::executor::ContextPhaseParams::operator=="], [0, 3, 1, "_CPPv4NO12tensorrt_llm8executor18ContextPhaseParams17popFirstGenTokensEv", "tensorrt_llm::executor::ContextPhaseParams::popFirstGenTokens"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams12releaseStateEv", "tensorrt_llm::executor::ContextPhaseParams::releaseState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParamsD0Ev", "tensorrt_llm::executor::ContextPhaseParams::~ContextPhaseParams"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverStateE", "tensorrt_llm::executor::DataTransceiverState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState20DataTransceiverStateEN8kv_cache10CacheStateEN8kv_cache9CommStateE", 
"tensorrt_llm::executor::DataTransceiverState::DataTransceiverState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState20DataTransceiverStateEv", "tensorrt_llm::executor::DataTransceiverState::DataTransceiverState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState20DataTransceiverStateEN8kv_cache10CacheStateEN8kv_cache9CommStateE", "tensorrt_llm::executor::DataTransceiverState::DataTransceiverState::cacheState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState20DataTransceiverStateEN8kv_cache10CacheStateEN8kv_cache9CommStateE", "tensorrt_llm::executor::DataTransceiverState::DataTransceiverState::commState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20DataTransceiverState13getCacheStateEv", "tensorrt_llm::executor::DataTransceiverState::getCacheState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20DataTransceiverState12getCommStateEv", "tensorrt_llm::executor::DataTransceiverState::getCommState"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState11mCacheStateE", "tensorrt_llm::executor::DataTransceiverState::mCacheState"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState10mCommStateE", "tensorrt_llm::executor::DataTransceiverState::mCommState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20DataTransceiverStateeqERK20DataTransceiverState", "tensorrt_llm::executor::DataTransceiverState::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor20DataTransceiverStateeqERK20DataTransceiverState", "tensorrt_llm::executor::DataTransceiverState::operator==::other"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState13setCacheStateEN8kv_cache10CacheStateE", "tensorrt_llm::executor::DataTransceiverState::setCacheState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState13setCacheStateEN8kv_cache10CacheStateE", "tensorrt_llm::executor::DataTransceiverState::setCacheState::state"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor20DataTransceiverState12setCommStateEN8kv_cache9CommStateE", "tensorrt_llm::executor::DataTransceiverState::setCommState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState12setCommStateEN8kv_cache9CommStateE", "tensorrt_llm::executor::DataTransceiverState::setCommState::state"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20DataTransceiverState8toStringEv", "tensorrt_llm::executor::DataTransceiverState::toString"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor8DataTypeE", "tensorrt_llm::executor::DataType"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType5kBF16E", "tensorrt_llm::executor::DataType::kBF16"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType5kBOOLE", "tensorrt_llm::executor::DataType::kBOOL"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType5kFP16E", "tensorrt_llm::executor::DataType::kFP16"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType5kFP32E", "tensorrt_llm::executor::DataType::kFP32"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType4kFP8E", "tensorrt_llm::executor::DataType::kFP8"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType6kINT32E", "tensorrt_llm::executor::DataType::kINT32"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType6kINT64E", "tensorrt_llm::executor::DataType::kINT64"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType5kINT8E", "tensorrt_llm::executor::DataType::kINT8"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType6kUINT8E", "tensorrt_llm::executor::DataType::kUINT8"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType8kUNKNOWNE", "tensorrt_llm::executor::DataType::kUNKNOWN"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfigE", "tensorrt_llm::executor::DebugConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig11DebugConfigEbb9StringVec10SizeType32", "tensorrt_llm::executor::DebugConfig::DebugConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig11DebugConfigEbb9StringVec10SizeType32", 
"tensorrt_llm::executor::DebugConfig::DebugConfig::debugInputTensors"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig11DebugConfigEbb9StringVec10SizeType32", "tensorrt_llm::executor::DebugConfig::DebugConfig::debugOutputTensors"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig11DebugConfigEbb9StringVec10SizeType32", "tensorrt_llm::executor::DebugConfig::DebugConfig::debugTensorNames"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig11DebugConfigEbb9StringVec10SizeType32", "tensorrt_llm::executor::DebugConfig::DebugConfig::debugTensorsMaxIterations"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig9StringVecE", "tensorrt_llm::executor::DebugConfig::StringVec"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11DebugConfig20getDebugInputTensorsEv", "tensorrt_llm::executor::DebugConfig::getDebugInputTensors"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11DebugConfig21getDebugOutputTensorsEv", "tensorrt_llm::executor::DebugConfig::getDebugOutputTensors"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11DebugConfig19getDebugTensorNamesEv", "tensorrt_llm::executor::DebugConfig::getDebugTensorNames"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11DebugConfig28getDebugTensorsMaxIterationsEv", "tensorrt_llm::executor::DebugConfig::getDebugTensorsMaxIterations"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig18mDebugInputTensorsE", "tensorrt_llm::executor::DebugConfig::mDebugInputTensors"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig19mDebugOutputTensorsE", "tensorrt_llm::executor::DebugConfig::mDebugOutputTensors"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig17mDebugTensorNamesE", "tensorrt_llm::executor::DebugConfig::mDebugTensorNames"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig26mDebugTensorsMaxIterationsE", "tensorrt_llm::executor::DebugConfig::mDebugTensorsMaxIterations"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11DebugConfigeqERK11DebugConfig", "tensorrt_llm::executor::DebugConfig::operator=="], [0, 4, 1, 
"_CPPv4NK12tensorrt_llm8executor11DebugConfigeqERK11DebugConfig", "tensorrt_llm::executor::DebugConfig::operator==::other"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig20setDebugInputTensorsEb", "tensorrt_llm::executor::DebugConfig::setDebugInputTensors"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig20setDebugInputTensorsEb", "tensorrt_llm::executor::DebugConfig::setDebugInputTensors::debugInputTensors"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig21setDebugOutputTensorsEb", "tensorrt_llm::executor::DebugConfig::setDebugOutputTensors"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig21setDebugOutputTensorsEb", "tensorrt_llm::executor::DebugConfig::setDebugOutputTensors::debugOutputTensors"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig19setDebugTensorNamesERK9StringVec", "tensorrt_llm::executor::DebugConfig::setDebugTensorNames"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig19setDebugTensorNamesERK9StringVec", "tensorrt_llm::executor::DebugConfig::setDebugTensorNames::debugTensorNames"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig28setDebugTensorsMaxIterationsE10SizeType32", "tensorrt_llm::executor::DebugConfig::setDebugTensorsMaxIterations"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig28setDebugTensorsMaxIterationsE10SizeType32", "tensorrt_llm::executor::DebugConfig::setDebugTensorsMaxIterations::debugTensorsMaxIterations"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor24DebugTensorsPerIterationE", "tensorrt_llm::executor::DebugTensorsPerIteration"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor24DebugTensorsPerIteration12debugTensorsE", "tensorrt_llm::executor::DebugTensorsPerIteration::debugTensors"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor24DebugTensorsPerIteration4iterE", "tensorrt_llm::executor::DebugTensorsPerIteration::iter"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfigE", "tensorrt_llm::executor::DecodingConfig"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor14DecodingConfig14DecodingConfigENSt8optionalI12DecodingModeEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI13MedusaChoicesEENSt8optionalI11EagleConfigEE", "tensorrt_llm::executor::DecodingConfig::DecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14DecodingConfigENSt8optionalI12DecodingModeEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI13MedusaChoicesEENSt8optionalI11EagleConfigEE", "tensorrt_llm::executor::DecodingConfig::DecodingConfig::decodingMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14DecodingConfigENSt8optionalI12DecodingModeEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI13MedusaChoicesEENSt8optionalI11EagleConfigEE", "tensorrt_llm::executor::DecodingConfig::DecodingConfig::eagleConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14DecodingConfigENSt8optionalI12DecodingModeEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI13MedusaChoicesEENSt8optionalI11EagleConfigEE", "tensorrt_llm::executor::DecodingConfig::DecodingConfig::lookaheadDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14DecodingConfigENSt8optionalI12DecodingModeEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI13MedusaChoicesEENSt8optionalI11EagleConfigEE", "tensorrt_llm::executor::DecodingConfig::DecodingConfig::medusaChoices"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig31enableSeamlessLookaheadDecodingEv", "tensorrt_llm::executor::DecodingConfig::enableSeamlessLookaheadDecoding"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig15getDecodingModeEv", "tensorrt_llm::executor::DecodingConfig::getDecodingMode"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig14getEagleConfigEv", "tensorrt_llm::executor::DecodingConfig::getEagleConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig26getLookaheadDecodingConfigEv", "tensorrt_llm::executor::DecodingConfig::getLookaheadDecodingConfig"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor14DecodingConfig33getLookaheadDecodingMaxNumRequestEv", "tensorrt_llm::executor::DecodingConfig::getLookaheadDecodingMaxNumRequest"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig16getMedusaChoicesEv", "tensorrt_llm::executor::DecodingConfig::getMedusaChoices"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig13mDecodingModeE", "tensorrt_llm::executor::DecodingConfig::mDecodingMode"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig12mEagleConfigE", "tensorrt_llm::executor::DecodingConfig::mEagleConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig24mLookaheadDecodingConfigE", "tensorrt_llm::executor::DecodingConfig::mLookaheadDecodingConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig31mLookaheadDecodingMaxNumRequestE", "tensorrt_llm::executor::DecodingConfig::mLookaheadDecodingMaxNumRequest"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14mMedusaChoicesE", "tensorrt_llm::executor::DecodingConfig::mMedusaChoices"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14DecodingConfigeqERK14DecodingConfig", "tensorrt_llm::executor::DecodingConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor14DecodingConfigeqERK14DecodingConfig", "tensorrt_llm::executor::DecodingConfig::operator==::other"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig15setDecodingModeERK12DecodingMode", "tensorrt_llm::executor::DecodingConfig::setDecodingMode"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14setEagleConfigERK11EagleConfig", "tensorrt_llm::executor::DecodingConfig::setEagleConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig26setLookaheadDecodingConfigERK23LookaheadDecodingConfig", "tensorrt_llm::executor::DecodingConfig::setLookaheadDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig26setLookaheadDecodingConfigERK23LookaheadDecodingConfig", 
"tensorrt_llm::executor::DecodingConfig::setLookaheadDecodingConfig::lookaheadDecodingConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig16setMedusaChoicesERK13MedusaChoices", "tensorrt_llm::executor::DecodingConfig::setMedusaChoices"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor12DecodingModeE", "tensorrt_llm::executor::DecodingMode"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode4AutoEv", "tensorrt_llm::executor::DecodingMode::Auto"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode10BeamSearchEv", "tensorrt_llm::executor::DecodingMode::BeamSearch"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode12DecodingModeE14UnderlyingType", "tensorrt_llm::executor::DecodingMode::DecodingMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode12DecodingModeE14UnderlyingType", "tensorrt_llm::executor::DecodingMode::DecodingMode::state"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode5EagleEv", "tensorrt_llm::executor::DecodingMode::Eagle"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode19ExplicitDraftTokensEv", "tensorrt_llm::executor::DecodingMode::ExplicitDraftTokens"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode19ExternalDraftTokensEv", "tensorrt_llm::executor::DecodingMode::ExternalDraftTokens"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode9LookaheadEv", "tensorrt_llm::executor::DecodingMode::Lookahead"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode6MedusaEv", "tensorrt_llm::executor::DecodingMode::Medusa"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode4TopKEv", "tensorrt_llm::executor::DecodingMode::TopK"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode8TopKTopPEv", "tensorrt_llm::executor::DecodingMode::TopKTopP"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode4TopPEv", "tensorrt_llm::executor::DecodingMode::TopP"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode14UnderlyingTypeE", "tensorrt_llm::executor::DecodingMode::UnderlyingType"], 
[0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode9allBitSetE14UnderlyingType", "tensorrt_llm::executor::DecodingMode::allBitSet"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode9allBitSetE14UnderlyingType", "tensorrt_llm::executor::DecodingMode::allBitSet::bits"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode9anyBitSetE14UnderlyingType", "tensorrt_llm::executor::DecodingMode::anyBitSet"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode9anyBitSetE14UnderlyingType", "tensorrt_llm::executor::DecodingMode::anyBitSet::bits"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode7getNameEv", "tensorrt_llm::executor::DecodingMode::getName"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode8getStateEv", "tensorrt_llm::executor::DecodingMode::getState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode6isAutoEv", "tensorrt_llm::executor::DecodingMode::isAuto"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode12isBeamSearchEv", "tensorrt_llm::executor::DecodingMode::isBeamSearch"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode7isEagleEv", "tensorrt_llm::executor::DecodingMode::isEagle"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode21isExplicitDraftTokensEv", "tensorrt_llm::executor::DecodingMode::isExplicitDraftTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode21isExternalDraftTokensEv", "tensorrt_llm::executor::DecodingMode::isExternalDraftTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode11isLookaheadEv", "tensorrt_llm::executor::DecodingMode::isLookahead"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode8isMedusaEv", "tensorrt_llm::executor::DecodingMode::isMedusa"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode6isTopKEv", "tensorrt_llm::executor::DecodingMode::isTopK"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode13isTopKandTopPEv", "tensorrt_llm::executor::DecodingMode::isTopKandTopP"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor12DecodingMode12isTopKorTopPEv", "tensorrt_llm::executor::DecodingMode::isTopKorTopP"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode6isTopPEv", "tensorrt_llm::executor::DecodingMode::isTopP"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode14isUseBanTokensEv", "tensorrt_llm::executor::DecodingMode::isUseBanTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode13isUseBanWordsEv", "tensorrt_llm::executor::DecodingMode::isUseBanWords"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode20isUseExplicitEosStopEv", "tensorrt_llm::executor::DecodingMode::isUseExplicitEosStop"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode21isUseFrequencyPenaltyEv", "tensorrt_llm::executor::DecodingMode::isUseFrequencyPenalty"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode18isUseMaxLengthStopEv", "tensorrt_llm::executor::DecodingMode::isUseMaxLengthStop"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode14isUseMinLengthEv", "tensorrt_llm::executor::DecodingMode::isUseMinLength"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode9isUseMinPEv", "tensorrt_llm::executor::DecodingMode::isUseMinP"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode22isUseNoRepeatNgramSizeEv", "tensorrt_llm::executor::DecodingMode::isUseNoRepeatNgramSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode22isUseOccurrencePenaltyEv", "tensorrt_llm::executor::DecodingMode::isUseOccurrencePenalty"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode12isUsePenaltyEv", "tensorrt_llm::executor::DecodingMode::isUsePenalty"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode20isUsePresencePenaltyEv", "tensorrt_llm::executor::DecodingMode::isUsePresencePenalty"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode22isUseRepetitionPenaltyEv", "tensorrt_llm::executor::DecodingMode::isUseRepetitionPenalty"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor12DecodingMode17isUseStopCriteriaEv", "tensorrt_llm::executor::DecodingMode::isUseStopCriteria"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode14isUseStopWordsEv", "tensorrt_llm::executor::DecodingMode::isUseStopWords"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode16isUseTemperatureEv", "tensorrt_llm::executor::DecodingMode::isUseTemperature"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode28isUseVariableBeamWidthSearchEv", "tensorrt_llm::executor::DecodingMode::isUseVariableBeamWidthSearch"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode5kAutoE", "tensorrt_llm::executor::DecodingMode::kAuto"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode11kBeamSearchE", "tensorrt_llm::executor::DecodingMode::kBeamSearch"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode6kEagleE", "tensorrt_llm::executor::DecodingMode::kEagle"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode20kExplicitDraftTokensE", "tensorrt_llm::executor::DecodingMode::kExplicitDraftTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode20kExternalDraftTokensE", "tensorrt_llm::executor::DecodingMode::kExternalDraftTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode10kLookaheadE", "tensorrt_llm::executor::DecodingMode::kLookahead"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode7kMedusaE", "tensorrt_llm::executor::DecodingMode::kMedusa"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode9kNumFlagsE", "tensorrt_llm::executor::DecodingMode::kNumFlags"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode5kTopKE", "tensorrt_llm::executor::DecodingMode::kTopK"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode9kTopKTopPE", "tensorrt_llm::executor::DecodingMode::kTopKTopP"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode5kTopPE", "tensorrt_llm::executor::DecodingMode::kTopP"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode13kUseBanTokensE", 
"tensorrt_llm::executor::DecodingMode::kUseBanTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode12kUseBanWordsE", "tensorrt_llm::executor::DecodingMode::kUseBanWords"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode19kUseExplicitEosStopE", "tensorrt_llm::executor::DecodingMode::kUseExplicitEosStop"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode22kUseFrequencyPenaltiesE", "tensorrt_llm::executor::DecodingMode::kUseFrequencyPenalties"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode17kUseMaxLengthStopE", "tensorrt_llm::executor::DecodingMode::kUseMaxLengthStop"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode13kUseMinLengthE", "tensorrt_llm::executor::DecodingMode::kUseMinLength"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode8kUseMinPE", "tensorrt_llm::executor::DecodingMode::kUseMinP"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode21kUseNoRepeatNgramSizeE", "tensorrt_llm::executor::DecodingMode::kUseNoRepeatNgramSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode23kUseOccurrencePenaltiesE", "tensorrt_llm::executor::DecodingMode::kUseOccurrencePenalties"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode13kUsePenaltiesE", "tensorrt_llm::executor::DecodingMode::kUsePenalties"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode21kUsePresencePenaltiesE", "tensorrt_llm::executor::DecodingMode::kUsePresencePenalties"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode23kUseRepetitionPenaltiesE", "tensorrt_llm::executor::DecodingMode::kUseRepetitionPenalties"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode24kUseStandardStopCriteriaE", "tensorrt_llm::executor::DecodingMode::kUseStandardStopCriteria"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode13kUseStopWordsE", "tensorrt_llm::executor::DecodingMode::kUseStopWords"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode15kUseTemperatureE", "tensorrt_llm::executor::DecodingMode::kUseTemperature"], 
[0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode27kUseVariableBeamWidthSearchE", "tensorrt_llm::executor::DecodingMode::kUseVariableBeamWidthSearch"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode6mStateE", "tensorrt_llm::executor::DecodingMode::mState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingModeeqERK12DecodingMode", "tensorrt_llm::executor::DecodingMode::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingModeeqERK12DecodingMode", "tensorrt_llm::executor::DecodingMode::operator==::other"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode8setBitToE14UnderlyingTypeb", "tensorrt_llm::executor::DecodingMode::setBitTo"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode8setBitToE14UnderlyingTypeb", "tensorrt_llm::executor::DecodingMode::setBitTo::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode8setBitToE14UnderlyingTypeb", "tensorrt_llm::executor::DecodingMode::setBitTo::x"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode12useBanTokensEb", "tensorrt_llm::executor::DecodingMode::useBanTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode12useBanTokensEb", "tensorrt_llm::executor::DecodingMode::useBanTokens::banTokens"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode11useBanWordsEb", "tensorrt_llm::executor::DecodingMode::useBanWords"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode11useBanWordsEb", "tensorrt_llm::executor::DecodingMode::useBanWords::banWords"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode18useExplicitEosStopEb", "tensorrt_llm::executor::DecodingMode::useExplicitEosStop"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode18useExplicitEosStopEb", "tensorrt_llm::executor::DecodingMode::useExplicitEosStop::explicitEosStop"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode19useFrequencyPenaltyEb", "tensorrt_llm::executor::DecodingMode::useFrequencyPenalty"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor12DecodingMode19useFrequencyPenaltyEb", "tensorrt_llm::executor::DecodingMode::useFrequencyPenalty::usePenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode16useMaxLengthStopEb", "tensorrt_llm::executor::DecodingMode::useMaxLengthStop"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode16useMaxLengthStopEb", "tensorrt_llm::executor::DecodingMode::useMaxLengthStop::maxLengthStop"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode12useMinLengthEb", "tensorrt_llm::executor::DecodingMode::useMinLength"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode12useMinLengthEb", "tensorrt_llm::executor::DecodingMode::useMinLength::useMinLen"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode7useMinPEb", "tensorrt_llm::executor::DecodingMode::useMinP"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode7useMinPEb", "tensorrt_llm::executor::DecodingMode::useMinP::useMinP"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode20useNoRepeatNgramSizeEb", "tensorrt_llm::executor::DecodingMode::useNoRepeatNgramSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode20useNoRepeatNgramSizeEb", "tensorrt_llm::executor::DecodingMode::useNoRepeatNgramSize::noRepeatNgramSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode22useOccurrencePenaltiesEb", "tensorrt_llm::executor::DecodingMode::useOccurrencePenalties"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode22useOccurrencePenaltiesEb", "tensorrt_llm::executor::DecodingMode::useOccurrencePenalties::usePenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode18usePresencePenaltyEb", "tensorrt_llm::executor::DecodingMode::usePresencePenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode18usePresencePenaltyEb", "tensorrt_llm::executor::DecodingMode::usePresencePenalty::usePenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode20useRepetitionPenaltyEb", 
"tensorrt_llm::executor::DecodingMode::useRepetitionPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode20useRepetitionPenaltyEb", "tensorrt_llm::executor::DecodingMode::useRepetitionPenalty::usePenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode12useStopWordsEb", "tensorrt_llm::executor::DecodingMode::useStopWords"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode12useStopWordsEb", "tensorrt_llm::executor::DecodingMode::useStopWords::stopWords"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode14useTemperatureEb", "tensorrt_llm::executor::DecodingMode::useTemperature"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode14useTemperatureEb", "tensorrt_llm::executor::DecodingMode::useTemperature::useTemp"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode26useVariableBeamWidthSearchEb", "tensorrt_llm::executor::DecodingMode::useVariableBeamWidthSearch"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode26useVariableBeamWidthSearchEb", "tensorrt_llm::executor::DecodingMode::useVariableBeamWidthSearch::useVariableBeamWidthSearch"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor22DisServingRequestStatsE", "tensorrt_llm::executor::DisServingRequestStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22DisServingRequestStats11kvCacheSizeE", "tensorrt_llm::executor::DisServingRequestStats::kvCacheSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22DisServingRequestStats17kvCacheTransferMSE", "tensorrt_llm::executor::DisServingRequestStats::kvCacheTransferMS"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfigE", "tensorrt_llm::executor::DynamicBatchConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig18DynamicBatchConfigEbb10SizeType32NSt6vectorINSt4pairI10SizeType3210SizeType32EEEE", "tensorrt_llm::executor::DynamicBatchConfig::DynamicBatchConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig18DynamicBatchConfigEbb10SizeType32NSt6vectorINSt4pairI10SizeType3210SizeType32EEEE", "tensorrt_llm::executor::DynamicBatchConfig::DynamicBatchConfig::batchSizeTable"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig18DynamicBatchConfigEbb10SizeType32NSt6vectorINSt4pairI10SizeType3210SizeType32EEEE", "tensorrt_llm::executor::DynamicBatchConfig::DynamicBatchConfig::dynamicBatchMovingAverageWindow"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig18DynamicBatchConfigEbb10SizeType32NSt6vectorINSt4pairI10SizeType3210SizeType32EEEE", "tensorrt_llm::executor::DynamicBatchConfig::DynamicBatchConfig::enableBatchSizeTuning"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig18DynamicBatchConfigEbb10SizeType32NSt6vectorINSt4pairI10SizeType3210SizeType32EEEE", "tensorrt_llm::executor::DynamicBatchConfig::DynamicBatchConfig::enableMaxNumTokensTuning"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18DynamicBatchConfig17getBatchSizeTableEv", "tensorrt_llm::executor::DynamicBatchConfig::getBatchSizeTable"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18DynamicBatchConfig34getDynamicBatchMovingAverageWindowEv", "tensorrt_llm::executor::DynamicBatchConfig::getDynamicBatchMovingAverageWindow"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18DynamicBatchConfig24getEnableBatchSizeTuningEv", "tensorrt_llm::executor::DynamicBatchConfig::getEnableBatchSizeTuning"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18DynamicBatchConfig27getEnableMaxNumTokensTuningEv", "tensorrt_llm::executor::DynamicBatchConfig::getEnableMaxNumTokensTuning"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig22kDefaultBatchSizeTableE", "tensorrt_llm::executor::DynamicBatchConfig::kDefaultBatchSizeTable"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig39kDefaultDynamicBatchMovingAverageWindowE", "tensorrt_llm::executor::DynamicBatchConfig::kDefaultDynamicBatchMovingAverageWindow"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig15mBatchSizeTableE", "tensorrt_llm::executor::DynamicBatchConfig::mBatchSizeTable"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig32mDynamicBatchMovingAverageWindowE", "tensorrt_llm::executor::DynamicBatchConfig::mDynamicBatchMovingAverageWindow"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig22mEnableBatchSizeTuningE", "tensorrt_llm::executor::DynamicBatchConfig::mEnableBatchSizeTuning"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig25mEnableMaxNumTokensTuningE", "tensorrt_llm::executor::DynamicBatchConfig::mEnableMaxNumTokensTuning"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor12EagleChoicesE", "tensorrt_llm::executor::EagleChoices"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfigE", "tensorrt_llm::executor::EagleConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig11EagleConfigENSt8optionalI12EagleChoicesEEbNSt8optionalIfEEbNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::EagleConfig::EagleConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig11EagleConfigENSt8optionalI12EagleChoicesEEbNSt8optionalIfEEbNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::EagleConfig::EagleConfig::dynamicTreeMaxTopK"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig11EagleConfigENSt8optionalI12EagleChoicesEEbNSt8optionalIfEEbNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::EagleConfig::EagleConfig::eagleChoices"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig11EagleConfigENSt8optionalI12EagleChoicesEEbNSt8optionalIfEEbNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::EagleConfig::EagleConfig::greedySampling"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig11EagleConfigENSt8optionalI12EagleChoicesEEbNSt8optionalIfEEbNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::EagleConfig::EagleConfig::posteriorThreshold"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor11EagleConfig11EagleConfigENSt8optionalI12EagleChoicesEEbNSt8optionalIfEEbNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::EagleConfig::EagleConfig::useDynamicTree"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig19checkPosteriorValueERKNSt8optionalIfEE", "tensorrt_llm::executor::EagleConfig::checkPosteriorValue"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig19checkPosteriorValueERKNSt8optionalIfEE", "tensorrt_llm::executor::EagleConfig::checkPosteriorValue::value"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11EagleConfig21getDynamicTreeMaxTopKEv", "tensorrt_llm::executor::EagleConfig::getDynamicTreeMaxTopK"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11EagleConfig15getEagleChoicesEv", "tensorrt_llm::executor::EagleConfig::getEagleChoices"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11EagleConfig21getPosteriorThresholdEv", "tensorrt_llm::executor::EagleConfig::getPosteriorThreshold"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11EagleConfig16isGreedySamplingEv", "tensorrt_llm::executor::EagleConfig::isGreedySampling"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig19mDynamicTreeMaxTopKE", "tensorrt_llm::executor::EagleConfig::mDynamicTreeMaxTopK"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig13mEagleChoicesE", "tensorrt_llm::executor::EagleConfig::mEagleChoices"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig15mGreedySamplingE", "tensorrt_llm::executor::EagleConfig::mGreedySampling"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig19mPosteriorThresholdE", "tensorrt_llm::executor::EagleConfig::mPosteriorThreshold"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig15mUseDynamicTreeE", "tensorrt_llm::executor::EagleConfig::mUseDynamicTree"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11EagleConfigeqERK11EagleConfig", "tensorrt_llm::executor::EagleConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor11EagleConfigeqERK11EagleConfig", 
"tensorrt_llm::executor::EagleConfig::operator==::other"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11EagleConfig14useDynamicTreeEv", "tensorrt_llm::executor::EagleConfig::useDynamicTree"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8ExecutorE", "tensorrt_llm::executor::Executor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorENSt10shared_ptrI5ModelEENSt10shared_ptrI5ModelEERK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorENSt10shared_ptrI5ModelEERK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfigRKNSt8optionalINSt3mapINSt6stringE6TensorEEEE", "tensorrt_llm::executor::Executor::Executor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK8Executor", "tensorrt_llm::executor::Executor::Executor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERR8Executor", "tensorrt_llm::executor::Executor::Executor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::decoderEngineBuffer"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfig", 
"tensorrt_llm::executor::Executor::Executor::decoderJsonConfigStr"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorENSt10shared_ptrI5ModelEENSt10shared_ptrI5ModelEERK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::decoderModel"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::decoderModelPath"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::encoderEngineBuffer"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::encoderJsonConfigStr"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorENSt10shared_ptrI5ModelEENSt10shared_ptrI5ModelEERK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::encoderModel"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::encoderModelPath"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfigRKNSt8optionalINSt3mapINSt6stringE6TensorEEEE", "tensorrt_llm::executor::Executor::Executor::engineBuffer"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK8Executor", "tensorrt_llm::executor::Executor::Executor::executor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorENSt10shared_ptrI5ModelEENSt10shared_ptrI5ModelEERK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::executorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorENSt10shared_ptrI5ModelEERK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::executorConfig"], [0, 
4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfigRKNSt8optionalINSt3mapINSt6stringE6TensorEEEE", "tensorrt_llm::executor::Executor::Executor::executorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::executorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::executorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::executorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfigRKNSt8optionalINSt3mapINSt6stringE6TensorEEEE", "tensorrt_llm::executor::Executor::Executor::jsonConfigStr"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfigRKNSt8optionalINSt3mapINSt6stringE6TensorEEEE", "tensorrt_llm::executor::Executor::Executor::managedWeights"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorENSt10shared_ptrI5ModelEERK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::model"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::modelPath"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfigRKNSt8optionalINSt3mapINSt6stringE6TensorEEEE", "tensorrt_llm::executor::Executor::Executor::modelType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfig", 
"tensorrt_llm::executor::Executor::Executor::modelType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::modelType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::modelType"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERK6IdTypeRKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::Executor::awaitResponses"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERKNSt6vectorI6IdTypeEERKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::Executor::awaitResponses"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::Executor::awaitResponses"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERK6IdTypeRKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::Executor::awaitResponses::requestId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERKNSt6vectorI6IdTypeEERKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::Executor::awaitResponses::requestIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERK6IdTypeRKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::Executor::awaitResponses::timeout"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERKNSt6vectorI6IdTypeEERKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::Executor::awaitResponses::timeout"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::Executor::awaitResponses::timeout"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8Executor18canEnqueueRequestsEv", 
"tensorrt_llm::executor::Executor::canEnqueueRequests"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor13cancelRequestE6IdType", "tensorrt_llm::executor::Executor::cancelRequest"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor13cancelRequestE6IdType", "tensorrt_llm::executor::Executor::cancelRequest::requestId"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor14enqueueRequestERK7Request", "tensorrt_llm::executor::Executor::enqueueRequest"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor14enqueueRequestERK7Request", "tensorrt_llm::executor::Executor::enqueueRequest::request"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor15enqueueRequestsERKNSt6vectorI7RequestEE", "tensorrt_llm::executor::Executor::enqueueRequests"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor15enqueueRequestsERKNSt6vectorI7RequestEE", "tensorrt_llm::executor::Executor::enqueueRequests::requests"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8Executor22getKVCacheEventManagerEv", "tensorrt_llm::executor::Executor::getKVCacheEventManager"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor21getLatestDebugTensorsEv", "tensorrt_llm::executor::Executor::getLatestDebugTensors"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor23getLatestIterationStatsEv", "tensorrt_llm::executor::Executor::getLatestIterationStats"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor21getLatestRequestStatsEv", "tensorrt_llm::executor::Executor::getLatestRequestStats"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8Executor20getNumResponsesReadyERKNSt8optionalI6IdTypeEE", "tensorrt_llm::executor::Executor::getNumResponsesReady"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8Executor20getNumResponsesReadyERKNSt8optionalI6IdTypeEE", "tensorrt_llm::executor::Executor::getNumResponsesReady::requestId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8Executor13isParticipantEv", "tensorrt_llm::executor::Executor::isParticipant"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8Executor5mImplE", 
"tensorrt_llm::executor::Executor::mImpl"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8ExecutoraSERK8Executor", "tensorrt_llm::executor::Executor::operator="], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8ExecutoraSERR8Executor", "tensorrt_llm::executor::Executor::operator="], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8ExecutoraSERK8Executor", "tensorrt_llm::executor::Executor::operator=::executor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor8shutdownEv", "tensorrt_llm::executor::Executor::shutdown"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8ExecutorD0Ev", "tensorrt_llm::executor::Executor::~Executor"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfigE", "tensorrt_llm::executor::ExecutorConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::additionalModelOutputs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::batchingType"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::cacheTransceiverConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::debugConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::decodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::enableChunkedContext"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::enableTrtOverlap"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::extendedRuntimePerfKnobConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::gatherGenerationLogits"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::gpuWeightsPercent"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::guidedDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::iterStatsMaxIterations"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::kvCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::logitsPostProcessorConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::maxBatchSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::maxBeamWidth"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::maxNumTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::maxQueueSize"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::maxSeqIdleMicroseconds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::normalizeLogProbs"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::parallelConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::peftCacheConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::promptTableOffloading"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::recvPollPeriodMs"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::requestStatsMaxIterations"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::schedulerConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::specDecConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::useGpuDirectStorage"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getAdditionalModelOutputsEv", "tensorrt_llm::executor::ExecutorConfig::getAdditionalModelOutputs"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getBatchingTypeEv", "tensorrt_llm::executor::ExecutorConfig::getBatchingType"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getCacheTransceiverConfigEv", "tensorrt_llm::executor::ExecutorConfig::getCacheTransceiverConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig14getDebugConfigEv", 
"tensorrt_llm::executor::ExecutorConfig::getDebugConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig17getDecodingConfigEv", "tensorrt_llm::executor::ExecutorConfig::getDecodingConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig23getEnableChunkedContextEv", "tensorrt_llm::executor::ExecutorConfig::getEnableChunkedContext"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig19getEnableTrtOverlapEv", "tensorrt_llm::executor::ExecutorConfig::getEnableTrtOverlap"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig32getExtendedRuntimePerfKnobConfigEv", "tensorrt_llm::executor::ExecutorConfig::getExtendedRuntimePerfKnobConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getGatherGenerationLogitsEv", "tensorrt_llm::executor::ExecutorConfig::getGatherGenerationLogits"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig20getGpuWeightsPercentEv", "tensorrt_llm::executor::ExecutorConfig::getGpuWeightsPercent"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig23getGuidedDecodingConfigEv", "tensorrt_llm::executor::ExecutorConfig::getGuidedDecodingConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getIterStatsMaxIterationsEv", "tensorrt_llm::executor::ExecutorConfig::getIterStatsMaxIterations"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig16getKvCacheConfigEv", "tensorrt_llm::executor::ExecutorConfig::getKvCacheConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig19getKvCacheConfigRefEv", "tensorrt_llm::executor::ExecutorConfig::getKvCacheConfigRef"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig28getLogitsPostProcessorConfigEv", "tensorrt_llm::executor::ExecutorConfig::getLogitsPostProcessorConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getMaxBatchSizeEv", "tensorrt_llm::executor::ExecutorConfig::getMaxBatchSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getMaxBeamWidthEv", 
"tensorrt_llm::executor::ExecutorConfig::getMaxBeamWidth"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getMaxNumTokensEv", "tensorrt_llm::executor::ExecutorConfig::getMaxNumTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getMaxQueueSizeEv", "tensorrt_llm::executor::ExecutorConfig::getMaxQueueSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getMaxSeqIdleMicrosecondsEv", "tensorrt_llm::executor::ExecutorConfig::getMaxSeqIdleMicroseconds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig20getNormalizeLogProbsEv", "tensorrt_llm::executor::ExecutorConfig::getNormalizeLogProbs"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig17getParallelConfigEv", "tensorrt_llm::executor::ExecutorConfig::getParallelConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig18getPeftCacheConfigEv", "tensorrt_llm::executor::ExecutorConfig::getPeftCacheConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig24getPromptTableOffloadingEv", "tensorrt_llm::executor::ExecutorConfig::getPromptTableOffloading"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig19getRecvPollPeriodMsEv", "tensorrt_llm::executor::ExecutorConfig::getRecvPollPeriodMs"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig28getRequestStatsMaxIterationsEv", "tensorrt_llm::executor::ExecutorConfig::getRequestStatsMaxIterations"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig18getSchedulerConfigEv", "tensorrt_llm::executor::ExecutorConfig::getSchedulerConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig21getSchedulerConfigRefEv", "tensorrt_llm::executor::ExecutorConfig::getSchedulerConfigRef"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig16getSpecDecConfigEv", "tensorrt_llm::executor::ExecutorConfig::getSpecDecConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig22getUseGpuDirectStorageEv", 
"tensorrt_llm::executor::ExecutorConfig::getUseGpuDirectStorage"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig30kDefaultIterStatsMaxIterationsE", "tensorrt_llm::executor::ExecutorConfig::kDefaultIterStatsMaxIterations"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig30kDefaultMaxSeqIdleMicrosecondsE", "tensorrt_llm::executor::ExecutorConfig::kDefaultMaxSeqIdleMicroseconds"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig33kDefaultRequestStatsMaxIterationsE", "tensorrt_llm::executor::ExecutorConfig::kDefaultRequestStatsMaxIterations"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mAdditionalModelOutputsE", "tensorrt_llm::executor::ExecutorConfig::mAdditionalModelOutputs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mBatchingTypeE", "tensorrt_llm::executor::ExecutorConfig::mBatchingType"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mCacheTransceiverConfigE", "tensorrt_llm::executor::ExecutorConfig::mCacheTransceiverConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig12mDebugConfigE", "tensorrt_llm::executor::ExecutorConfig::mDebugConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15mDecodingConfigE", "tensorrt_llm::executor::ExecutorConfig::mDecodingConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig21mEnableChunkedContextE", "tensorrt_llm::executor::ExecutorConfig::mEnableChunkedContext"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig17mEnableTrtOverlapE", "tensorrt_llm::executor::ExecutorConfig::mEnableTrtOverlap"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig30mExtendedRuntimePerfKnobConfigE", "tensorrt_llm::executor::ExecutorConfig::mExtendedRuntimePerfKnobConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mGatherGenerationLogitsE", "tensorrt_llm::executor::ExecutorConfig::mGatherGenerationLogits"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18mGpuWeightsPercentE", 
"tensorrt_llm::executor::ExecutorConfig::mGpuWeightsPercent"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig21mGuidedDecodingConfigE", "tensorrt_llm::executor::ExecutorConfig::mGuidedDecodingConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mIterStatsMaxIterationsE", "tensorrt_llm::executor::ExecutorConfig::mIterStatsMaxIterations"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14mKvCacheConfigE", "tensorrt_llm::executor::ExecutorConfig::mKvCacheConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig26mLogitsPostProcessorConfigE", "tensorrt_llm::executor::ExecutorConfig::mLogitsPostProcessorConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mMaxBatchSizeE", "tensorrt_llm::executor::ExecutorConfig::mMaxBatchSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mMaxBeamWidthE", "tensorrt_llm::executor::ExecutorConfig::mMaxBeamWidth"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mMaxNumTokensE", "tensorrt_llm::executor::ExecutorConfig::mMaxNumTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mMaxQueueSizeE", "tensorrt_llm::executor::ExecutorConfig::mMaxQueueSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mMaxSeqIdleMicrosecondsE", "tensorrt_llm::executor::ExecutorConfig::mMaxSeqIdleMicroseconds"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18mNormalizeLogProbsE", "tensorrt_llm::executor::ExecutorConfig::mNormalizeLogProbs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15mParallelConfigE", "tensorrt_llm::executor::ExecutorConfig::mParallelConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig16mPeftCacheConfigE", "tensorrt_llm::executor::ExecutorConfig::mPeftCacheConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig22mPromptTableOffloadingE", "tensorrt_llm::executor::ExecutorConfig::mPromptTableOffloading"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig17mRecvPollPeriodMsE", "tensorrt_llm::executor::ExecutorConfig::mRecvPollPeriodMs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig26mRequestStatsMaxIterationsE", "tensorrt_llm::executor::ExecutorConfig::mRequestStatsMaxIterations"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig16mSchedulerConfigE", "tensorrt_llm::executor::ExecutorConfig::mSchedulerConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig26mSpeculativeDecodingConfigE", "tensorrt_llm::executor::ExecutorConfig::mSpeculativeDecodingConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig20mUseGpuDirectStorageE", "tensorrt_llm::executor::ExecutorConfig::mUseGpuDirectStorage"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setAdditionalModelOutputsERKNSt6vectorI21AdditionalModelOutputEE", "tensorrt_llm::executor::ExecutorConfig::setAdditionalModelOutputs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setAdditionalModelOutputsERKNSt6vectorI21AdditionalModelOutputEE", "tensorrt_llm::executor::ExecutorConfig::setAdditionalModelOutputs::additionalModelOutputs"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setBatchingTypeE12BatchingType", "tensorrt_llm::executor::ExecutorConfig::setBatchingType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setBatchingTypeE12BatchingType", "tensorrt_llm::executor::ExecutorConfig::setBatchingType::batchingType"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setCacheTransceiverConfigERK22CacheTransceiverConfig", "tensorrt_llm::executor::ExecutorConfig::setCacheTransceiverConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setCacheTransceiverConfigERK22CacheTransceiverConfig", "tensorrt_llm::executor::ExecutorConfig::setCacheTransceiverConfig::cacheTransceiverConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14setDebugConfigERK11DebugConfig", 
"tensorrt_llm::executor::ExecutorConfig::setDebugConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14setDebugConfigERK11DebugConfig", "tensorrt_llm::executor::ExecutorConfig::setDebugConfig::debugConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig17setDecodingConfigERK14DecodingConfig", "tensorrt_llm::executor::ExecutorConfig::setDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig17setDecodingConfigERK14DecodingConfig", "tensorrt_llm::executor::ExecutorConfig::setDecodingConfig::decodingConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23setEnableChunkedContextEb", "tensorrt_llm::executor::ExecutorConfig::setEnableChunkedContext"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23setEnableChunkedContextEb", "tensorrt_llm::executor::ExecutorConfig::setEnableChunkedContext::enableChunkedContext"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig19setEnableTrtOverlapEb", "tensorrt_llm::executor::ExecutorConfig::setEnableTrtOverlap"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig19setEnableTrtOverlapEb", "tensorrt_llm::executor::ExecutorConfig::setEnableTrtOverlap::enableTrtOverlap"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig32setExtendedRuntimePerfKnobConfigERK29ExtendedRuntimePerfKnobConfig", "tensorrt_llm::executor::ExecutorConfig::setExtendedRuntimePerfKnobConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig32setExtendedRuntimePerfKnobConfigERK29ExtendedRuntimePerfKnobConfig", "tensorrt_llm::executor::ExecutorConfig::setExtendedRuntimePerfKnobConfig::extendedRuntimePerfKnobConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setGatherGenerationLogitsEb", "tensorrt_llm::executor::ExecutorConfig::setGatherGenerationLogits"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setGatherGenerationLogitsEb", "tensorrt_llm::executor::ExecutorConfig::setGatherGenerationLogits::gatherGenerationLogits"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig20setGpuWeightsPercentERKf", "tensorrt_llm::executor::ExecutorConfig::setGpuWeightsPercent"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig20setGpuWeightsPercentERKf", "tensorrt_llm::executor::ExecutorConfig::setGpuWeightsPercent::gpuWeightsPercent"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23setGuidedDecodingConfigERK20GuidedDecodingConfig", "tensorrt_llm::executor::ExecutorConfig::setGuidedDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23setGuidedDecodingConfigERK20GuidedDecodingConfig", "tensorrt_llm::executor::ExecutorConfig::setGuidedDecodingConfig::guidedDecodingConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setIterStatsMaxIterationsE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setIterStatsMaxIterations"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setIterStatsMaxIterationsE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setIterStatsMaxIterations::iterStatsMaxIterations"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig16setKvCacheConfigERK13KvCacheConfig", "tensorrt_llm::executor::ExecutorConfig::setKvCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig16setKvCacheConfigERK13KvCacheConfig", "tensorrt_llm::executor::ExecutorConfig::setKvCacheConfig::kvCacheConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig28setLogitsPostProcessorConfigERK25LogitsPostProcessorConfig", "tensorrt_llm::executor::ExecutorConfig::setLogitsPostProcessorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig28setLogitsPostProcessorConfigERK25LogitsPostProcessorConfig", "tensorrt_llm::executor::ExecutorConfig::setLogitsPostProcessorConfig::logitsPostProcessorConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxBatchSizeE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setMaxBatchSize"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxBatchSizeE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setMaxBatchSize::maxBatchSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxBeamWidthE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setMaxBeamWidth"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxBeamWidthE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setMaxBeamWidth::maxBeamWidth"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxNumTokensE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setMaxNumTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxNumTokensE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setMaxNumTokens::maxNumTokens"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxQueueSizeERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::ExecutorConfig::setMaxQueueSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxQueueSizeERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::ExecutorConfig::setMaxQueueSize::maxQueueSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setMaxSeqIdleMicrosecondsE8uint64_t", "tensorrt_llm::executor::ExecutorConfig::setMaxSeqIdleMicroseconds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setMaxSeqIdleMicrosecondsE8uint64_t", "tensorrt_llm::executor::ExecutorConfig::setMaxSeqIdleMicroseconds::maxSeqIdleMicroseconds"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig20setNormalizeLogProbsEb", "tensorrt_llm::executor::ExecutorConfig::setNormalizeLogProbs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig20setNormalizeLogProbsEb", "tensorrt_llm::executor::ExecutorConfig::setNormalizeLogProbs::normalizeLogProbs"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig17setParallelConfigERK14ParallelConfig", "tensorrt_llm::executor::ExecutorConfig::setParallelConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig17setParallelConfigERK14ParallelConfig", "tensorrt_llm::executor::ExecutorConfig::setParallelConfig::parallelConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18setPeftCacheConfigERK15PeftCacheConfig", "tensorrt_llm::executor::ExecutorConfig::setPeftCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18setPeftCacheConfigERK15PeftCacheConfig", "tensorrt_llm::executor::ExecutorConfig::setPeftCacheConfig::peftCacheConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig24setPromptTableOffloadingEb", "tensorrt_llm::executor::ExecutorConfig::setPromptTableOffloading"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig24setPromptTableOffloadingEb", "tensorrt_llm::executor::ExecutorConfig::setPromptTableOffloading::promptTableOffloading"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig19setRecvPollPeriodMsERK10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setRecvPollPeriodMs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig19setRecvPollPeriodMsERK10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setRecvPollPeriodMs::recvPollPeriodMs"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig28setRequestStatsMaxIterationsE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setRequestStatsMaxIterations"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig28setRequestStatsMaxIterationsE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setRequestStatsMaxIterations::requestStatsMaxIterations"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18setSchedulerConfigERK15SchedulerConfig", "tensorrt_llm::executor::ExecutorConfig::setSchedulerConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18setSchedulerConfigERK15SchedulerConfig", "tensorrt_llm::executor::ExecutorConfig::setSchedulerConfig::schedulerConfig"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig16setSpecDecConfigERK25SpeculativeDecodingConfig", "tensorrt_llm::executor::ExecutorConfig::setSpecDecConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig16setSpecDecConfigERK25SpeculativeDecodingConfig", "tensorrt_llm::executor::ExecutorConfig::setSpecDecConfig::specDecConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig22setUseGpuDirectStorageERKb", "tensorrt_llm::executor::ExecutorConfig::setUseGpuDirectStorage"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig22setUseGpuDirectStorageERKb", "tensorrt_llm::executor::ExecutorConfig::setUseGpuDirectStorage::useGpuDirectStorage"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfigE", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig29ExtendedRuntimePerfKnobConfigEbbb10SizeType32", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::ExtendedRuntimePerfKnobConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig29ExtendedRuntimePerfKnobConfigEbbb10SizeType32", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::ExtendedRuntimePerfKnobConfig::cudaGraphCacheSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig29ExtendedRuntimePerfKnobConfigEbbb10SizeType32", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::ExtendedRuntimePerfKnobConfig::cudaGraphMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig29ExtendedRuntimePerfKnobConfigEbbb10SizeType32", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::ExtendedRuntimePerfKnobConfig::enableContextFMHAFP32Acc"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig29ExtendedRuntimePerfKnobConfigEbbb10SizeType32", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::ExtendedRuntimePerfKnobConfig::multiBlockMode"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig21getCudaGraphCacheSizeEv", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::getCudaGraphCacheSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig16getCudaGraphModeEv", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::getCudaGraphMode"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig27getEnableContextFMHAFP32AccEv", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::getEnableContextFMHAFP32Acc"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig17getMultiBlockModeEv", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::getMultiBlockMode"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig19mCudaGraphCacheSizeE", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::mCudaGraphCacheSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig14mCudaGraphModeE", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::mCudaGraphMode"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig25mEnableContextFMHAFP32AccE", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::mEnableContextFMHAFP32Acc"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig15mMultiBlockModeE", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::mMultiBlockMode"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfigeqERK29ExtendedRuntimePerfKnobConfig", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfigeqERK29ExtendedRuntimePerfKnobConfig", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::operator==::other"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig21setCudaGraphCacheSizeE10SizeType32", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setCudaGraphCacheSize"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig21setCudaGraphCacheSizeE10SizeType32", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setCudaGraphCacheSize::cacheSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig16setCudaGraphModeEb", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setCudaGraphMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig16setCudaGraphModeEb", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setCudaGraphMode::cudaGraphMode"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig27setEnableContextFMHAFP32AccEb", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setEnableContextFMHAFP32Acc"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig27setEnableContextFMHAFP32AccEb", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setEnableContextFMHAFP32Acc::enableContextFMHAFP32Acc"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig17setMultiBlockModeEb", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setMultiBlockMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig17setMultiBlockModeEb", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setMultiBlockMode::multiBlockMode"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfigE", "tensorrt_llm::executor::ExternalDraftTokensConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig25ExternalDraftTokensConfigE9VecTokensNSt8optionalI6TensorEERKNSt8optionalI9FloatTypeEERKNSt8optionalIbEE", "tensorrt_llm::executor::ExternalDraftTokensConfig::ExternalDraftTokensConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig25ExternalDraftTokensConfigE9VecTokensNSt8optionalI6TensorEERKNSt8optionalI9FloatTypeEERKNSt8optionalIbEE", "tensorrt_llm::executor::ExternalDraftTokensConfig::ExternalDraftTokensConfig::acceptanceThreshold"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig25ExternalDraftTokensConfigE9VecTokensNSt8optionalI6TensorEERKNSt8optionalI9FloatTypeEERKNSt8optionalIbEE", "tensorrt_llm::executor::ExternalDraftTokensConfig::ExternalDraftTokensConfig::fastLogits"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig25ExternalDraftTokensConfigE9VecTokensNSt8optionalI6TensorEERKNSt8optionalI9FloatTypeEERKNSt8optionalIbEE", "tensorrt_llm::executor::ExternalDraftTokensConfig::ExternalDraftTokensConfig::logits"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig25ExternalDraftTokensConfigE9VecTokensNSt8optionalI6TensorEERKNSt8optionalI9FloatTypeEERKNSt8optionalIbEE", "tensorrt_llm::executor::ExternalDraftTokensConfig::ExternalDraftTokensConfig::tokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor25ExternalDraftTokensConfig22getAcceptanceThresholdEv", "tensorrt_llm::executor::ExternalDraftTokensConfig::getAcceptanceThreshold"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor25ExternalDraftTokensConfig13getFastLogitsEv", "tensorrt_llm::executor::ExternalDraftTokensConfig::getFastLogits"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor25ExternalDraftTokensConfig9getLogitsEv", "tensorrt_llm::executor::ExternalDraftTokensConfig::getLogits"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor25ExternalDraftTokensConfig9getTokensEv", "tensorrt_llm::executor::ExternalDraftTokensConfig::getTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig20mAcceptanceThresholdE", "tensorrt_llm::executor::ExternalDraftTokensConfig::mAcceptanceThreshold"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig11mFastLogitsE", "tensorrt_llm::executor::ExternalDraftTokensConfig::mFastLogits"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig7mLogitsE", "tensorrt_llm::executor::ExternalDraftTokensConfig::mLogits"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig7mTokensE", 
"tensorrt_llm::executor::ExternalDraftTokensConfig::mTokens"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor12FinishReasonE", "tensorrt_llm::executor::FinishReason"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12FinishReason10kCANCELLEDE", "tensorrt_llm::executor::FinishReason::kCANCELLED"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12FinishReason7kEND_IDE", "tensorrt_llm::executor::FinishReason::kEND_ID"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12FinishReason7kLENGTHE", "tensorrt_llm::executor::FinishReason::kLENGTH"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12FinishReason13kNOT_FINISHEDE", "tensorrt_llm::executor::FinishReason::kNOT_FINISHED"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12FinishReason11kSTOP_WORDSE", "tensorrt_llm::executor::FinishReason::kSTOP_WORDS"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12FinishReason10kTIMED_OUTE", "tensorrt_llm::executor::FinishReason::kTIMED_OUT"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor9FloatTypeE", "tensorrt_llm::executor::FloatType"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfigE", "tensorrt_llm::executor::GuidedDecodingConfig"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig21GuidedDecodingBackendE", "tensorrt_llm::executor::GuidedDecodingConfig::GuidedDecodingBackend"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig21GuidedDecodingBackend9kXGRAMMARE", "tensorrt_llm::executor::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig20GuidedDecodingConfigE21GuidedDecodingBackendNSt8optionalINSt6vectorINSt6stringEEEEENSt8optionalINSt6stringEEENSt8optionalINSt6vectorI11TokenIdTypeEEEE", "tensorrt_llm::executor::GuidedDecodingConfig::GuidedDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig20GuidedDecodingConfigE21GuidedDecodingBackendNSt8optionalINSt6vectorINSt6stringEEEEENSt8optionalINSt6stringEEENSt8optionalINSt6vectorI11TokenIdTypeEEEE", 
"tensorrt_llm::executor::GuidedDecodingConfig::GuidedDecodingConfig::backend"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig20GuidedDecodingConfigE21GuidedDecodingBackendNSt8optionalINSt6vectorINSt6stringEEEEENSt8optionalINSt6stringEEENSt8optionalINSt6vectorI11TokenIdTypeEEEE", "tensorrt_llm::executor::GuidedDecodingConfig::GuidedDecodingConfig::encodedVocab"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig20GuidedDecodingConfigE21GuidedDecodingBackendNSt8optionalINSt6vectorINSt6stringEEEEENSt8optionalINSt6stringEEENSt8optionalINSt6vectorI11TokenIdTypeEEEE", "tensorrt_llm::executor::GuidedDecodingConfig::GuidedDecodingConfig::stopTokenIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig20GuidedDecodingConfigE21GuidedDecodingBackendNSt8optionalINSt6vectorINSt6stringEEEEENSt8optionalINSt6stringEEENSt8optionalINSt6vectorI11TokenIdTypeEEEE", "tensorrt_llm::executor::GuidedDecodingConfig::GuidedDecodingConfig::tokenizerStr"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig10getBackendEv", "tensorrt_llm::executor::GuidedDecodingConfig::getBackend"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig15getEncodedVocabEv", "tensorrt_llm::executor::GuidedDecodingConfig::getEncodedVocab"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig15getStopTokenIdsEv", "tensorrt_llm::executor::GuidedDecodingConfig::getStopTokenIds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig15getTokenizerStrEv", "tensorrt_llm::executor::GuidedDecodingConfig::getTokenizerStr"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig8mBackendE", "tensorrt_llm::executor::GuidedDecodingConfig::mBackend"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig13mEncodedVocabE", "tensorrt_llm::executor::GuidedDecodingConfig::mEncodedVocab"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig13mStopTokenIdsE", 
"tensorrt_llm::executor::GuidedDecodingConfig::mStopTokenIds"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig13mTokenizerStrE", "tensorrt_llm::executor::GuidedDecodingConfig::mTokenizerStr"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfigeqERK20GuidedDecodingConfig", "tensorrt_llm::executor::GuidedDecodingConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfigeqERK20GuidedDecodingConfig", "tensorrt_llm::executor::GuidedDecodingConfig::operator==::other"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig10setBackendERK21GuidedDecodingBackend", "tensorrt_llm::executor::GuidedDecodingConfig::setBackend"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig10setBackendERK21GuidedDecodingBackend", "tensorrt_llm::executor::GuidedDecodingConfig::setBackend::backend"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig15setEncodedVocabERKNSt6vectorINSt6stringEEE", "tensorrt_llm::executor::GuidedDecodingConfig::setEncodedVocab"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig15setEncodedVocabERKNSt6vectorINSt6stringEEE", "tensorrt_llm::executor::GuidedDecodingConfig::setEncodedVocab::encodedVocab"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig15setStopTokenIdsERKNSt6vectorI11TokenIdTypeEE", "tensorrt_llm::executor::GuidedDecodingConfig::setStopTokenIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig15setStopTokenIdsERKNSt6vectorI11TokenIdTypeEE", "tensorrt_llm::executor::GuidedDecodingConfig::setStopTokenIds::stopTokenIds"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig15setTokenizerStrERKNSt6stringE", "tensorrt_llm::executor::GuidedDecodingConfig::setTokenizerStr"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig15setTokenizerStrERKNSt6stringE", "tensorrt_llm::executor::GuidedDecodingConfig::setTokenizerStr::tokenizerStr"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig8validateEv", "tensorrt_llm::executor::GuidedDecodingConfig::validate"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParamsE", "tensorrt_llm::executor::GuidedDecodingParams"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideTypeE", "tensorrt_llm::executor::GuidedDecodingParams::GuideType"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType13kEBNF_GRAMMARE", "tensorrt_llm::executor::GuidedDecodingParams::GuideType::kEBNF_GRAMMAR"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType5kJSONE", "tensorrt_llm::executor::GuidedDecodingParams::GuideType::kJSON"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType12kJSON_SCHEMAE", "tensorrt_llm::executor::GuidedDecodingParams::GuideType::kJSON_SCHEMA"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType6kREGEXE", "tensorrt_llm::executor::GuidedDecodingParams::GuideType::kREGEX"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType15kSTRUCTURAL_TAGE", "tensorrt_llm::executor::GuidedDecodingParams::GuideType::kSTRUCTURAL_TAG"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams20GuidedDecodingParamsE9GuideTypeNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::GuidedDecodingParams::GuidedDecodingParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams20GuidedDecodingParamsE9GuideTypeNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::GuidedDecodingParams::GuidedDecodingParams::guide"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams20GuidedDecodingParamsE9GuideTypeNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::GuidedDecodingParams::GuidedDecodingParams::guideType"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingParams8getGuideEv", "tensorrt_llm::executor::GuidedDecodingParams::getGuide"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor20GuidedDecodingParams12getGuideTypeEv", "tensorrt_llm::executor::GuidedDecodingParams::getGuideType"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams6mGuideE", "tensorrt_llm::executor::GuidedDecodingParams::mGuide"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams10mGuideTypeE", "tensorrt_llm::executor::GuidedDecodingParams::mGuideType"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingParamseqERK20GuidedDecodingParams", "tensorrt_llm::executor::GuidedDecodingParams::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingParamseqERK20GuidedDecodingParams", "tensorrt_llm::executor::GuidedDecodingParams::operator==::other"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor6IdTypeE", "tensorrt_llm::executor::IdType"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStatsE", "tensorrt_llm::executor::InflightBatchingStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats26avgNumDecodedTokensPerIterE", "tensorrt_llm::executor::InflightBatchingStats::avgNumDecodedTokensPerIter"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats12microBatchIdE", "tensorrt_llm::executor::InflightBatchingStats::microBatchId"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats18numContextRequestsE", "tensorrt_llm::executor::InflightBatchingStats::numContextRequests"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats12numCtxTokensE", "tensorrt_llm::executor::InflightBatchingStats::numCtxTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats14numGenRequestsE", "tensorrt_llm::executor::InflightBatchingStats::numGenRequests"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats17numPausedRequestsE", "tensorrt_llm::executor::InflightBatchingStats::numPausedRequests"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats20numScheduledRequestsE", 
"tensorrt_llm::executor::InflightBatchingStats::numScheduledRequests"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor14IterationStatsE", "tensorrt_llm::executor::IterationStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats11cpuMemUsageE", "tensorrt_llm::executor::IterationStats::cpuMemUsage"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats17crossKvCacheStatsE", "tensorrt_llm::executor::IterationStats::crossKvCacheStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats11gpuMemUsageE", "tensorrt_llm::executor::IterationStats::gpuMemUsage"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats21inflightBatchingStatsE", "tensorrt_llm::executor::IterationStats::inflightBatchingStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats4iterE", "tensorrt_llm::executor::IterationStats::iter"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats13iterLatencyMSE", "tensorrt_llm::executor::IterationStats::iterLatencyMS"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats12kvCacheStatsE", "tensorrt_llm::executor::IterationStats::kvCacheStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats19maxBatchSizeRuntimeE", "tensorrt_llm::executor::IterationStats::maxBatchSizeRuntime"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats18maxBatchSizeStaticE", "tensorrt_llm::executor::IterationStats::maxBatchSizeStatic"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats28maxBatchSizeTunerRecommendedE", "tensorrt_llm::executor::IterationStats::maxBatchSizeTunerRecommended"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats20maxNumActiveRequestsE", "tensorrt_llm::executor::IterationStats::maxNumActiveRequests"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats19maxNumTokensRuntimeE", "tensorrt_llm::executor::IterationStats::maxNumTokensRuntime"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats18maxNumTokensStaticE", 
"tensorrt_llm::executor::IterationStats::maxNumTokensStatic"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats28maxNumTokensTunerRecommendedE", "tensorrt_llm::executor::IterationStats::maxNumTokensTunerRecommended"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats31newActiveRequestsQueueLatencyMSE", "tensorrt_llm::executor::IterationStats::newActiveRequestsQueueLatencyMS"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats17numActiveRequestsE", "tensorrt_llm::executor::IterationStats::numActiveRequests"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats20numCompletedRequestsE", "tensorrt_llm::executor::IterationStats::numCompletedRequests"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats20numNewActiveRequestsE", "tensorrt_llm::executor::IterationStats::numNewActiveRequests"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats17numQueuedRequestsE", "tensorrt_llm::executor::IterationStats::numQueuedRequests"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats14pinnedMemUsageE", "tensorrt_llm::executor::IterationStats::pinnedMemUsage"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats19staticBatchingStatsE", "tensorrt_llm::executor::IterationStats::staticBatchingStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats9timestampE", "tensorrt_llm::executor::IterationStats::timestamp"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor13IterationTypeE", "tensorrt_llm::executor::IterationType"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor17JsonSerializationE", "tensorrt_llm::executor::JsonSerialization"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK12RequestStats", "tensorrt_llm::executor::JsonSerialization::toJsonStr"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK14IterationStats", "tensorrt_llm::executor::JsonSerialization::toJsonStr"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK24RequestStatsPerIteration", 
"tensorrt_llm::executor::JsonSerialization::toJsonStr"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK14IterationStats", "tensorrt_llm::executor::JsonSerialization::toJsonStr::iterationStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK12RequestStats", "tensorrt_llm::executor::JsonSerialization::toJsonStr::requestStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK24RequestStatsPerIteration", "tensorrt_llm::executor::JsonSerialization::toJsonStr::requestStatsPerIter"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheCreatedDataE", "tensorrt_llm::executor::KVCacheCreatedData"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheCreatedData22numBlocksPerCacheLevelE", "tensorrt_llm::executor::KVCacheCreatedData::numBlocksPerCacheLevel"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor12KVCacheEventE", "tensorrt_llm::executor::KVCacheEvent"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12KVCacheEvent12KVCacheEventE6IdType16KVCacheEventData", "tensorrt_llm::executor::KVCacheEvent::KVCacheEvent"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12KVCacheEvent12KVCacheEventE6IdType16KVCacheEventData", "tensorrt_llm::executor::KVCacheEvent::KVCacheEvent::data"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12KVCacheEvent12KVCacheEventE6IdType16KVCacheEventData", "tensorrt_llm::executor::KVCacheEvent::KVCacheEvent::eventId"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KVCacheEvent4dataE", "tensorrt_llm::executor::KVCacheEvent::data"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KVCacheEvent7eventIdE", "tensorrt_llm::executor::KVCacheEvent::eventId"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor16KVCacheEventDataE", "tensorrt_llm::executor::KVCacheEventData"], [0, 2, 1, "_CPPv4I0EN12tensorrt_llm8executor16KVCacheEventDiffE", "tensorrt_llm::executor::KVCacheEventDiff"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor16KVCacheEventDiffE", "tensorrt_llm::executor::KVCacheEventDiff::T"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor16KVCacheEventDiff8newValueE", "tensorrt_llm::executor::KVCacheEventDiff::newValue"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor16KVCacheEventDiff8oldValueE", "tensorrt_llm::executor::KVCacheEventDiff::oldValue"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManagerE", "tensorrt_llm::executor::KVCacheEventManager"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManager19KVCacheEventManagerENSt10shared_ptrIN12tensorrt_llm13batch_manager16kv_cache_manager18BaseKVCacheManagerEEE", "tensorrt_llm::executor::KVCacheEventManager::KVCacheEventManager"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManager19KVCacheEventManagerENSt10shared_ptrIN12tensorrt_llm13batch_manager16kv_cache_manager18BaseKVCacheManagerEEE", "tensorrt_llm::executor::KVCacheEventManager::KVCacheEventManager::kvCacheManager"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManager15getLatestEventsENSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::KVCacheEventManager::getLatestEvents"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManager15getLatestEventsENSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::KVCacheEventManager::getLatestEvents::timeout"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManager14kvCacheManagerE", "tensorrt_llm::executor::KVCacheEventManager::kvCacheManager"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheRemovedDataE", "tensorrt_llm::executor::KVCacheRemovedData"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheRemovedData11blockHashesE", "tensorrt_llm::executor::KVCacheRemovedData::blockHashes"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockDataE", "tensorrt_llm::executor::KVCacheStoredBlockData"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData22KVCacheStoredBlockDataE6IdTypeN12tensorrt_llm7runtime15VecUniqueTokensENSt8optionalIN12tensorrt_llm7runtime14LoraTaskIdTypeEEE10SizeType3210SizeType32", 
"tensorrt_llm::executor::KVCacheStoredBlockData::KVCacheStoredBlockData"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData22KVCacheStoredBlockDataE6IdTypeN12tensorrt_llm7runtime15VecUniqueTokensENSt8optionalIN12tensorrt_llm7runtime14LoraTaskIdTypeEEE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheStoredBlockData::KVCacheStoredBlockData::blockHash"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData22KVCacheStoredBlockDataE6IdTypeN12tensorrt_llm7runtime15VecUniqueTokensENSt8optionalIN12tensorrt_llm7runtime14LoraTaskIdTypeEEE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheStoredBlockData::KVCacheStoredBlockData::cacheLevel"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData22KVCacheStoredBlockDataE6IdTypeN12tensorrt_llm7runtime15VecUniqueTokensENSt8optionalIN12tensorrt_llm7runtime14LoraTaskIdTypeEEE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheStoredBlockData::KVCacheStoredBlockData::loraId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData22KVCacheStoredBlockDataE6IdTypeN12tensorrt_llm7runtime15VecUniqueTokensENSt8optionalIN12tensorrt_llm7runtime14LoraTaskIdTypeEEE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheStoredBlockData::KVCacheStoredBlockData::priority"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData22KVCacheStoredBlockDataE6IdTypeN12tensorrt_llm7runtime15VecUniqueTokensENSt8optionalIN12tensorrt_llm7runtime14LoraTaskIdTypeEEE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheStoredBlockData::KVCacheStoredBlockData::tokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData9blockHashE", "tensorrt_llm::executor::KVCacheStoredBlockData::blockHash"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData10cacheLevelE", "tensorrt_llm::executor::KVCacheStoredBlockData::cacheLevel"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData6loraIdE", 
"tensorrt_llm::executor::KVCacheStoredBlockData::loraId"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData8priorityE", "tensorrt_llm::executor::KVCacheStoredBlockData::priority"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData6tokensE", "tensorrt_llm::executor::KVCacheStoredBlockData::tokens"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor17KVCacheStoredDataE", "tensorrt_llm::executor::KVCacheStoredData"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor17KVCacheStoredData6blocksE", "tensorrt_llm::executor::KVCacheStoredData::blocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor17KVCacheStoredData10parentHashE", "tensorrt_llm::executor::KVCacheStoredData::parentHash"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedDataE", "tensorrt_llm::executor::KVCacheUpdatedData"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData18KVCacheUpdatedDataE6IdType", "tensorrt_llm::executor::KVCacheUpdatedData::KVCacheUpdatedData"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData18KVCacheUpdatedDataE6IdType", "tensorrt_llm::executor::KVCacheUpdatedData::KVCacheUpdatedData::blockHash"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData9blockHashE", "tensorrt_llm::executor::KVCacheUpdatedData::blockHash"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData10cacheLevelE", "tensorrt_llm::executor::KVCacheUpdatedData::cacheLevel"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData17cacheLevelUpdatedE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheUpdatedData::cacheLevelUpdated"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData17cacheLevelUpdatedE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheUpdatedData::cacheLevelUpdated::newValue"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData17cacheLevelUpdatedE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheUpdatedData::cacheLevelUpdated::oldValue"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData8priorityE", "tensorrt_llm::executor::KVCacheUpdatedData::priority"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData15priorityUpdatedE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheUpdatedData::priorityUpdated"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData15priorityUpdatedE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheUpdatedData::priorityUpdated::newValue"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData15priorityUpdatedE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheUpdatedData::priorityUpdated::oldValue"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfigE", "tensorrt_llm::executor::KvCacheConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::copyOnPartialReuse"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", 
"tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::crossKvCacheFraction"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::enableBlockReuse"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::enablePartialReuse"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::eventBufferMaxSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::freeGpuMemoryFraction"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::hostCacheSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::maxAttentionWindowVec"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::maxTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::onboardBlocks"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::runtimeDefaults"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::secondaryOffloadMinPriority"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::sinkTokenLength"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig34fillEmptyFieldsFromRuntimeDefaultsEN12tensorrt_llm7runtime15RuntimeDefaultsE", "tensorrt_llm::executor::KvCacheConfig::fillEmptyFieldsFromRuntimeDefaults"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig34fillEmptyFieldsFromRuntimeDefaultsEN12tensorrt_llm7runtime15RuntimeDefaultsE", "tensorrt_llm::executor::KvCacheConfig::fillEmptyFieldsFromRuntimeDefaults::runtimeDefaults"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig21getCopyOnPartialReuseEv", "tensorrt_llm::executor::KvCacheConfig::getCopyOnPartialReuse"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig23getCrossKvCacheFractionEv", 
"tensorrt_llm::executor::KvCacheConfig::getCrossKvCacheFraction"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig19getEnableBlockReuseEv", "tensorrt_llm::executor::KvCacheConfig::getEnableBlockReuse"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig21getEnablePartialReuseEv", "tensorrt_llm::executor::KvCacheConfig::getEnablePartialReuse"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig21getEventBufferMaxSizeEv", "tensorrt_llm::executor::KvCacheConfig::getEventBufferMaxSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig24getFreeGpuMemoryFractionEv", "tensorrt_llm::executor::KvCacheConfig::getFreeGpuMemoryFraction"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig16getHostCacheSizeEv", "tensorrt_llm::executor::KvCacheConfig::getHostCacheSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig24getMaxAttentionWindowVecEv", "tensorrt_llm::executor::KvCacheConfig::getMaxAttentionWindowVec"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig12getMaxTokensEv", "tensorrt_llm::executor::KvCacheConfig::getMaxTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig16getOnboardBlocksEv", "tensorrt_llm::executor::KvCacheConfig::getOnboardBlocks"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig30getSecondaryOffloadMinPriorityEv", "tensorrt_llm::executor::KvCacheConfig::getSecondaryOffloadMinPriority"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig18getSinkTokenLengthEv", "tensorrt_llm::executor::KvCacheConfig::getSinkTokenLength"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig19mCopyOnPartialReuseE", "tensorrt_llm::executor::KvCacheConfig::mCopyOnPartialReuse"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21mCrossKvCacheFractionE", "tensorrt_llm::executor::KvCacheConfig::mCrossKvCacheFraction"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig17mEnableBlockReuseE", "tensorrt_llm::executor::KvCacheConfig::mEnableBlockReuse"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor13KvCacheConfig19mEnablePartialReuseE", "tensorrt_llm::executor::KvCacheConfig::mEnablePartialReuse"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig19mEventBufferMaxSizeE", "tensorrt_llm::executor::KvCacheConfig::mEventBufferMaxSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig22mFreeGpuMemoryFractionE", "tensorrt_llm::executor::KvCacheConfig::mFreeGpuMemoryFraction"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig14mHostCacheSizeE", "tensorrt_llm::executor::KvCacheConfig::mHostCacheSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig22mMaxAttentionWindowVecE", "tensorrt_llm::executor::KvCacheConfig::mMaxAttentionWindowVec"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig10mMaxTokensE", "tensorrt_llm::executor::KvCacheConfig::mMaxTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig14mOnboardBlocksE", "tensorrt_llm::executor::KvCacheConfig::mOnboardBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig28mSecondaryOffloadMinPriorityE", "tensorrt_llm::executor::KvCacheConfig::mSecondaryOffloadMinPriority"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig16mSinkTokenLengthE", "tensorrt_llm::executor::KvCacheConfig::mSinkTokenLength"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21setCopyOnPartialReuseEb", "tensorrt_llm::executor::KvCacheConfig::setCopyOnPartialReuse"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21setCopyOnPartialReuseEb", "tensorrt_llm::executor::KvCacheConfig::setCopyOnPartialReuse::copyOnPartialReuse"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig23setCrossKvCacheFractionE9FloatType", "tensorrt_llm::executor::KvCacheConfig::setCrossKvCacheFraction"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig23setCrossKvCacheFractionE9FloatType", "tensorrt_llm::executor::KvCacheConfig::setCrossKvCacheFraction::crossKvCacheFraction"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor13KvCacheConfig19setEnableBlockReuseEb", "tensorrt_llm::executor::KvCacheConfig::setEnableBlockReuse"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig19setEnableBlockReuseEb", "tensorrt_llm::executor::KvCacheConfig::setEnableBlockReuse::enableBlockReuse"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21setEnablePartialReuseEb", "tensorrt_llm::executor::KvCacheConfig::setEnablePartialReuse"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21setEnablePartialReuseEb", "tensorrt_llm::executor::KvCacheConfig::setEnablePartialReuse::enablePartialReuse"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21setEventBufferMaxSizeE6size_t", "tensorrt_llm::executor::KvCacheConfig::setEventBufferMaxSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21setEventBufferMaxSizeE6size_t", "tensorrt_llm::executor::KvCacheConfig::setEventBufferMaxSize::eventBufferMaxSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig24setFreeGpuMemoryFractionE9FloatType", "tensorrt_llm::executor::KvCacheConfig::setFreeGpuMemoryFraction"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig24setFreeGpuMemoryFractionE9FloatType", "tensorrt_llm::executor::KvCacheConfig::setFreeGpuMemoryFraction::freeGpuMemoryFraction"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig16setHostCacheSizeE6size_t", "tensorrt_llm::executor::KvCacheConfig::setHostCacheSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig16setHostCacheSizeE6size_t", "tensorrt_llm::executor::KvCacheConfig::setHostCacheSize::hostCacheSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig24setMaxAttentionWindowVecENSt6vectorI10SizeType32EE", "tensorrt_llm::executor::KvCacheConfig::setMaxAttentionWindowVec"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig24setMaxAttentionWindowVecENSt6vectorI10SizeType32EE", "tensorrt_llm::executor::KvCacheConfig::setMaxAttentionWindowVec::maxAttentionWindowVec"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor13KvCacheConfig12setMaxTokensE10SizeType32", "tensorrt_llm::executor::KvCacheConfig::setMaxTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig12setMaxTokensE10SizeType32", "tensorrt_llm::executor::KvCacheConfig::setMaxTokens::maxTokens"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig16setOnboardBlocksEb", "tensorrt_llm::executor::KvCacheConfig::setOnboardBlocks"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig16setOnboardBlocksEb", "tensorrt_llm::executor::KvCacheConfig::setOnboardBlocks::onboardBlocks"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig30setSecondaryOffloadMinPriorityENSt8optionalI17RetentionPriorityEE", "tensorrt_llm::executor::KvCacheConfig::setSecondaryOffloadMinPriority"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig30setSecondaryOffloadMinPriorityENSt8optionalI17RetentionPriorityEE", "tensorrt_llm::executor::KvCacheConfig::setSecondaryOffloadMinPriority::secondaryOffloadMinPriority"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig18setSinkTokenLengthE10SizeType32", "tensorrt_llm::executor::KvCacheConfig::setSinkTokenLength"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig18setSinkTokenLengthE10SizeType32", "tensorrt_llm::executor::KvCacheConfig::setSinkTokenLength::sinkTokenLength"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfigE", "tensorrt_llm::executor::KvCacheRetentionConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::KvCacheRetentionConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigEv", "tensorrt_llm::executor::KvCacheRetentionConfig::KvCacheRetentionConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::KvCacheRetentionConfig::decodeDurationMs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::KvCacheRetentionConfig::decodeRetentionPriority"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::KvCacheRetentionConfig::tokenRangeRetentionPriorities"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfigE", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig25TokenRangeRetentionConfigE10SizeType32NSt8optionalI10SizeType32EE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::TokenRangeRetentionConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig25TokenRangeRetentionConfigE10SizeType32NSt8optionalI10SizeType32EE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::TokenRangeRetentionConfig::durationMs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig25TokenRangeRetentionConfigE10SizeType32NSt8optionalI10SizeType32EE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE", 
"tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::TokenRangeRetentionConfig::priority"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig25TokenRangeRetentionConfigE10SizeType32NSt8optionalI10SizeType32EE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::TokenRangeRetentionConfig::tokenEnd"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig25TokenRangeRetentionConfigE10SizeType32NSt8optionalI10SizeType32EE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::TokenRangeRetentionConfig::tokenStart"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig10durationMsE", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::durationMs"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfigeqERK25TokenRangeRetentionConfig", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfigeqERK25TokenRangeRetentionConfig", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::operator==::other"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig8priorityE", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::priority"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig8tokenEndE", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::tokenEnd"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig10tokenStartE", 
"tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::tokenStart"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig19getDecodeDurationMsEv", "tensorrt_llm::executor::KvCacheRetentionConfig::getDecodeDurationMs"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig26getDecodeRetentionPriorityEv", "tensorrt_llm::executor::KvCacheRetentionConfig::getDecodeRetentionPriority"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig36getPerBlockRetentionPriorityDurationE10SizeType3210SizeType32", "tensorrt_llm::executor::KvCacheRetentionConfig::getPerBlockRetentionPriorityDuration"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig36getPerBlockRetentionPriorityDurationE10SizeType3210SizeType32", "tensorrt_llm::executor::KvCacheRetentionConfig::getPerBlockRetentionPriorityDuration::blockSize"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig36getPerBlockRetentionPriorityDurationE10SizeType3210SizeType32", "tensorrt_llm::executor::KvCacheRetentionConfig::getPerBlockRetentionPriorityDuration::seqLen"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig29getTokenRangeRetentionConfigsEv", "tensorrt_llm::executor::KvCacheRetentionConfig::getTokenRangeRetentionConfigs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25kDefaultRetentionPriorityE", "tensorrt_llm::executor::KvCacheRetentionConfig::kDefaultRetentionPriority"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig21kMaxRetentionPriorityE", "tensorrt_llm::executor::KvCacheRetentionConfig::kMaxRetentionPriority"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig21kMinRetentionPriorityE", "tensorrt_llm::executor::KvCacheRetentionConfig::kMinRetentionPriority"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig17mDecodeDurationMsE", "tensorrt_llm::executor::KvCacheRetentionConfig::mDecodeDurationMs"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig24mDecodeRetentionPriorityE", "tensorrt_llm::executor::KvCacheRetentionConfig::mDecodeRetentionPriority"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig27mTokenRangeRetentionConfigsE", "tensorrt_llm::executor::KvCacheRetentionConfig::mTokenRangeRetentionConfigs"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfigeqERK22KvCacheRetentionConfig", "tensorrt_llm::executor::KvCacheRetentionConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfigeqERK22KvCacheRetentionConfig", "tensorrt_llm::executor::KvCacheRetentionConfig::operator==::other"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStatsE", "tensorrt_llm::executor::KvCacheStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStats14allocNewBlocksE", "tensorrt_llm::executor::KvCacheStats::allocNewBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStats16allocTotalBlocksE", "tensorrt_llm::executor::KvCacheStats::allocTotalBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStats12cacheHitRateE", "tensorrt_llm::executor::KvCacheStats::cacheHitRate"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStats13freeNumBlocksE", "tensorrt_llm::executor::KvCacheStats::freeNumBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStats12maxNumBlocksE", "tensorrt_llm::executor::KvCacheStats::maxNumBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStats12missedBlocksE", "tensorrt_llm::executor::KvCacheStats::missedBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStats12reusedBlocksE", "tensorrt_llm::executor::KvCacheStats::reusedBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStats14tokensPerBlockE", "tensorrt_llm::executor::KvCacheStats::tokensPerBlock"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStats13usedNumBlocksE", "tensorrt_llm::executor::KvCacheStats::usedNumBlocks"], [0, 1, 1, 
"_CPPv4N12tensorrt_llm8executor19LogitsPostProcessorE", "tensorrt_llm::executor::LogitsPostProcessor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor26LogitsPostProcessorBatchedE", "tensorrt_llm::executor::LogitsPostProcessorBatched"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfigE", "tensorrt_llm::executor::LogitsPostProcessorConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig25LogitsPostProcessorConfigENSt8optionalI22LogitsPostProcessorMapEENSt8optionalI26LogitsPostProcessorBatchedEEb", "tensorrt_llm::executor::LogitsPostProcessorConfig::LogitsPostProcessorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig25LogitsPostProcessorConfigENSt8optionalI22LogitsPostProcessorMapEENSt8optionalI26LogitsPostProcessorBatchedEEb", "tensorrt_llm::executor::LogitsPostProcessorConfig::LogitsPostProcessorConfig::processorBatched"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig25LogitsPostProcessorConfigENSt8optionalI22LogitsPostProcessorMapEENSt8optionalI26LogitsPostProcessorBatchedEEb", "tensorrt_llm::executor::LogitsPostProcessorConfig::LogitsPostProcessorConfig::processorMap"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig25LogitsPostProcessorConfigENSt8optionalI22LogitsPostProcessorMapEENSt8optionalI26LogitsPostProcessorBatchedEEb", "tensorrt_llm::executor::LogitsPostProcessorConfig::LogitsPostProcessorConfig::replicate"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor25LogitsPostProcessorConfig19getProcessorBatchedEv", "tensorrt_llm::executor::LogitsPostProcessorConfig::getProcessorBatched"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor25LogitsPostProcessorConfig15getProcessorMapEv", "tensorrt_llm::executor::LogitsPostProcessorConfig::getProcessorMap"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor25LogitsPostProcessorConfig12getReplicateEv", "tensorrt_llm::executor::LogitsPostProcessorConfig::getReplicate"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig17mProcessorBatchedE", "tensorrt_llm::executor::LogitsPostProcessorConfig::mProcessorBatched"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig13mProcessorMapE", "tensorrt_llm::executor::LogitsPostProcessorConfig::mProcessorMap"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig10mReplicateE", "tensorrt_llm::executor::LogitsPostProcessorConfig::mReplicate"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig19setProcessorBatchedERK26LogitsPostProcessorBatched", "tensorrt_llm::executor::LogitsPostProcessorConfig::setProcessorBatched"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig19setProcessorBatchedERK26LogitsPostProcessorBatched", "tensorrt_llm::executor::LogitsPostProcessorConfig::setProcessorBatched::processorBatched"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig15setProcessorMapERK22LogitsPostProcessorMap", "tensorrt_llm::executor::LogitsPostProcessorConfig::setProcessorMap"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig15setProcessorMapERK22LogitsPostProcessorMap", "tensorrt_llm::executor::LogitsPostProcessorConfig::setProcessorMap::processorMap"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig12setReplicateEb", "tensorrt_llm::executor::LogitsPostProcessorConfig::setReplicate"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig12setReplicateEb", "tensorrt_llm::executor::LogitsPostProcessorConfig::setReplicate::replicate"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor22LogitsPostProcessorMapE", "tensorrt_llm::executor::LogitsPostProcessorMap"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfigE", "tensorrt_llm::executor::LookaheadDecodingConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig23LookaheadDecodingConfigE10SizeType3210SizeType3210SizeType32", 
"tensorrt_llm::executor::LookaheadDecodingConfig::LookaheadDecodingConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig23LookaheadDecodingConfigEv", "tensorrt_llm::executor::LookaheadDecodingConfig::LookaheadDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig23LookaheadDecodingConfigE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::LookaheadDecodingConfig::ngramSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig23LookaheadDecodingConfigE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::LookaheadDecodingConfig::verificationSetSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig23LookaheadDecodingConfigE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::LookaheadDecodingConfig::windowSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig28calculateSpeculativeResourceEv", "tensorrt_llm::executor::LookaheadDecodingConfig::calculateSpeculativeResource"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig33calculateSpeculativeResourceTupleE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::calculateSpeculativeResourceTuple"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig33calculateSpeculativeResourceTupleE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::calculateSpeculativeResourceTuple::ngramSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig33calculateSpeculativeResourceTupleE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::calculateSpeculativeResourceTuple::verificationSetSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig33calculateSpeculativeResourceTupleE10SizeType3210SizeType3210SizeType32", 
"tensorrt_llm::executor::LookaheadDecodingConfig::calculateSpeculativeResourceTuple::windowSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig3getEv", "tensorrt_llm::executor::LookaheadDecodingConfig::get"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig12getNgramSizeEv", "tensorrt_llm::executor::LookaheadDecodingConfig::getNgramSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig22getVerificationSetSizeEv", "tensorrt_llm::executor::LookaheadDecodingConfig::getVerificationSetSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig13getWindowSizeEv", "tensorrt_llm::executor::LookaheadDecodingConfig::getWindowSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig4isLEERK23LookaheadDecodingConfig", "tensorrt_llm::executor::LookaheadDecodingConfig::isLE"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig4isLEERK23LookaheadDecodingConfig", "tensorrt_llm::executor::LookaheadDecodingConfig::isLE::that"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig7isLegalE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::isLegal"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig7isLegalE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::isLegal::ngramSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig7isLegalE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::isLegal::verificationSetSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig7isLegalE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::isLegal::windowSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig30kDefaultLookaheadDecodingNgramE", "tensorrt_llm::executor::LookaheadDecodingConfig::kDefaultLookaheadDecodingNgram"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig40kDefaultLookaheadDecodingVerificationSetE", "tensorrt_llm::executor::LookaheadDecodingConfig::kDefaultLookaheadDecodingVerificationSet"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig31kDefaultLookaheadDecodingWindowE", "tensorrt_llm::executor::LookaheadDecodingConfig::kDefaultLookaheadDecodingWindow"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig10mNgramSizeE", "tensorrt_llm::executor::LookaheadDecodingConfig::mNgramSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig20mVerificationSetSizeE", "tensorrt_llm::executor::LookaheadDecodingConfig::mVerificationSetSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig11mWindowSizeE", "tensorrt_llm::executor::LookaheadDecodingConfig::mWindowSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfigeqERK23LookaheadDecodingConfig", "tensorrt_llm::executor::LookaheadDecodingConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfigeqERK23LookaheadDecodingConfig", "tensorrt_llm::executor::LookaheadDecodingConfig::operator==::other"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor10LoraConfigE", "tensorrt_llm::executor::LoraConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor10LoraConfig10LoraConfigE6IdTypeNSt8optionalI6TensorEENSt8optionalI6TensorEE", "tensorrt_llm::executor::LoraConfig::LoraConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor10LoraConfig10LoraConfigE6IdTypeNSt8optionalI6TensorEENSt8optionalI6TensorEE", "tensorrt_llm::executor::LoraConfig::LoraConfig::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor10LoraConfig10LoraConfigE6IdTypeNSt8optionalI6TensorEENSt8optionalI6TensorEE", "tensorrt_llm::executor::LoraConfig::LoraConfig::taskId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor10LoraConfig10LoraConfigE6IdTypeNSt8optionalI6TensorEENSt8optionalI6TensorEE", "tensorrt_llm::executor::LoraConfig::LoraConfig::weights"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor10LoraConfig9getConfigEv", "tensorrt_llm::executor::LoraConfig::getConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor10LoraConfig9getTaskIdEv", "tensorrt_llm::executor::LoraConfig::getTaskId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor10LoraConfig10getWeightsEv", "tensorrt_llm::executor::LoraConfig::getWeights"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10LoraConfig7mConfigE", "tensorrt_llm::executor::LoraConfig::mConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10LoraConfig7mTaskIdE", "tensorrt_llm::executor::LoraConfig::mTaskId"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10LoraConfig8mWeightsE", "tensorrt_llm::executor::LoraConfig::mWeights"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor13MedusaChoicesE", "tensorrt_llm::executor::MedusaChoices"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor10MemoryTypeE", "tensorrt_llm::executor::MemoryType"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor10MemoryType4kCPUE", "tensorrt_llm::executor::MemoryType::kCPU"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor10MemoryType11kCPU_PINNEDE", "tensorrt_llm::executor::MemoryType::kCPU_PINNED"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor10MemoryType15kCPU_PINNEDPOOLE", "tensorrt_llm::executor::MemoryType::kCPU_PINNEDPOOL"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor10MemoryType4kGPUE", "tensorrt_llm::executor::MemoryType::kGPU"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor10MemoryType8kUNKNOWNE", "tensorrt_llm::executor::MemoryType::kUNKNOWN"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor10MemoryType4kUVME", "tensorrt_llm::executor::MemoryType::kUVM"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor16MillisecondsTypeE", "tensorrt_llm::executor::MillisecondsType"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor9ModelTypeE", "tensorrt_llm::executor::ModelType"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor9ModelType13kDECODER_ONLYE", "tensorrt_llm::executor::ModelType::kDECODER_ONLY"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor9ModelType16kENCODER_DECODERE", 
"tensorrt_llm::executor::ModelType::kENCODER_DECODER"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor9ModelType13kENCODER_ONLYE", "tensorrt_llm::executor::ModelType::kENCODER_ONLY"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor11MropeConfigE", "tensorrt_llm::executor::MropeConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor11MropeConfig11MropeConfigE6Tensor10SizeType32", "tensorrt_llm::executor::MropeConfig::MropeConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11MropeConfig11MropeConfigE6Tensor10SizeType32", "tensorrt_llm::executor::MropeConfig::MropeConfig::mropePositionDeltas"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11MropeConfig11MropeConfigE6Tensor10SizeType32", "tensorrt_llm::executor::MropeConfig::MropeConfig::mropeRoratySinCos"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11MropeConfig22getMRopePositionDeltasEv", "tensorrt_llm::executor::MropeConfig::getMRopePositionDeltas"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11MropeConfig20getMRopeRotaryCosSinEv", "tensorrt_llm::executor::MropeConfig::getMRopeRotaryCosSin"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11MropeConfig20mMRopePositionDeltasE", "tensorrt_llm::executor::MropeConfig::mMRopePositionDeltas"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11MropeConfig18mMRopeRotaryCosSinE", "tensorrt_llm::executor::MropeConfig::mMRopeRotaryCosSin"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfigE", "tensorrt_llm::executor::OrchestratorConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig18OrchestratorConfigEbNSt6stringENSt10shared_ptrIN3mpi7MpiCommEEEb", "tensorrt_llm::executor::OrchestratorConfig::OrchestratorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig18OrchestratorConfigEbNSt6stringENSt10shared_ptrIN3mpi7MpiCommEEEb", "tensorrt_llm::executor::OrchestratorConfig::OrchestratorConfig::isOrchestrator"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig18OrchestratorConfigEbNSt6stringENSt10shared_ptrIN3mpi7MpiCommEEEb", 
"tensorrt_llm::executor::OrchestratorConfig::OrchestratorConfig::orchLeaderComm"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig18OrchestratorConfigEbNSt6stringENSt10shared_ptrIN3mpi7MpiCommEEEb", "tensorrt_llm::executor::OrchestratorConfig::OrchestratorConfig::spawnProcesses"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig18OrchestratorConfigEbNSt6stringENSt10shared_ptrIN3mpi7MpiCommEEEb", "tensorrt_llm::executor::OrchestratorConfig::OrchestratorConfig::workerExecutablePath"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18OrchestratorConfig17getIsOrchestratorEv", "tensorrt_llm::executor::OrchestratorConfig::getIsOrchestrator"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18OrchestratorConfig17getOrchLeaderCommEv", "tensorrt_llm::executor::OrchestratorConfig::getOrchLeaderComm"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18OrchestratorConfig17getSpawnProcessesEv", "tensorrt_llm::executor::OrchestratorConfig::getSpawnProcesses"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18OrchestratorConfig23getWorkerExecutablePathEv", "tensorrt_llm::executor::OrchestratorConfig::getWorkerExecutablePath"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig15mIsOrchestratorE", "tensorrt_llm::executor::OrchestratorConfig::mIsOrchestrator"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig15mOrchLeaderCommE", "tensorrt_llm::executor::OrchestratorConfig::mOrchLeaderComm"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig15mSpawnProcessesE", "tensorrt_llm::executor::OrchestratorConfig::mSpawnProcesses"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig21mWorkerExecutablePathE", "tensorrt_llm::executor::OrchestratorConfig::mWorkerExecutablePath"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig17setIsOrchestratorEb", "tensorrt_llm::executor::OrchestratorConfig::setIsOrchestrator"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig17setIsOrchestratorEb", 
"tensorrt_llm::executor::OrchestratorConfig::setIsOrchestrator::isOrchestrator"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig17setOrchLeaderCommERKNSt10shared_ptrIN3mpi7MpiCommEEE", "tensorrt_llm::executor::OrchestratorConfig::setOrchLeaderComm"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig17setOrchLeaderCommERKNSt10shared_ptrIN3mpi7MpiCommEEE", "tensorrt_llm::executor::OrchestratorConfig::setOrchLeaderComm::orchLeaderComm"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig17setSpawnProcessesEb", "tensorrt_llm::executor::OrchestratorConfig::setSpawnProcesses"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig17setSpawnProcessesEb", "tensorrt_llm::executor::OrchestratorConfig::setSpawnProcesses::spawnProcesses"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig23setWorkerExecutablePathERKNSt6stringE", "tensorrt_llm::executor::OrchestratorConfig::setWorkerExecutablePath"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig23setWorkerExecutablePathERKNSt6stringE", "tensorrt_llm::executor::OrchestratorConfig::setWorkerExecutablePath::workerExecutablePath"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfigE", "tensorrt_llm::executor::OutputConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig12OutputConfigEbbbbbbNSt8optionalINSt6vectorI21AdditionalModelOutputEEEE", "tensorrt_llm::executor::OutputConfig::OutputConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig12OutputConfigEbbbbbbNSt8optionalINSt6vectorI21AdditionalModelOutputEEEE", "tensorrt_llm::executor::OutputConfig::OutputConfig::additionalModelOutputs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig12OutputConfigEbbbbbbNSt8optionalINSt6vectorI21AdditionalModelOutputEEEE", "tensorrt_llm::executor::OutputConfig::OutputConfig::excludeInputFromOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig12OutputConfigEbbbbbbNSt8optionalINSt6vectorI21AdditionalModelOutputEEEE", 
"tensorrt_llm::executor::OutputConfig::OutputConfig::returnContextLogits"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig12OutputConfigEbbbbbbNSt8optionalINSt6vectorI21AdditionalModelOutputEEEE", "tensorrt_llm::executor::OutputConfig::OutputConfig::returnEncoderOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig12OutputConfigEbbbbbbNSt8optionalINSt6vectorI21AdditionalModelOutputEEEE", "tensorrt_llm::executor::OutputConfig::OutputConfig::returnGenerationLogits"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig12OutputConfigEbbbbbbNSt8optionalINSt6vectorI21AdditionalModelOutputEEEE", "tensorrt_llm::executor::OutputConfig::OutputConfig::returnLogProbs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig12OutputConfigEbbbbbbNSt8optionalINSt6vectorI21AdditionalModelOutputEEEE", "tensorrt_llm::executor::OutputConfig::OutputConfig::returnPerfMetrics"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig22additionalModelOutputsE", "tensorrt_llm::executor::OutputConfig::additionalModelOutputs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig22excludeInputFromOutputE", "tensorrt_llm::executor::OutputConfig::excludeInputFromOutput"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig19returnContextLogitsE", "tensorrt_llm::executor::OutputConfig::returnContextLogits"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig19returnEncoderOutputE", "tensorrt_llm::executor::OutputConfig::returnEncoderOutput"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig22returnGenerationLogitsE", "tensorrt_llm::executor::OutputConfig::returnGenerationLogits"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig14returnLogProbsE", "tensorrt_llm::executor::OutputConfig::returnLogProbs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig17returnPerfMetricsE", "tensorrt_llm::executor::OutputConfig::returnPerfMetrics"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfigE", "tensorrt_llm::executor::ParallelConfig"], 
[0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig14ParallelConfigE17CommunicationType17CommunicationModeNSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI18OrchestratorConfigEENSt8optionalI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::ParallelConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig14ParallelConfigE17CommunicationType17CommunicationModeNSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI18OrchestratorConfigEENSt8optionalI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::ParallelConfig::commMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig14ParallelConfigE17CommunicationType17CommunicationModeNSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI18OrchestratorConfigEENSt8optionalI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::ParallelConfig::commType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig14ParallelConfigE17CommunicationType17CommunicationModeNSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI18OrchestratorConfigEENSt8optionalI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::ParallelConfig::deviceIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig14ParallelConfigE17CommunicationType17CommunicationModeNSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI18OrchestratorConfigEENSt8optionalI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::ParallelConfig::numNodes"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig14ParallelConfigE17CommunicationType17CommunicationModeNSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI18OrchestratorConfigEENSt8optionalI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::ParallelConfig::orchestratorConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ParallelConfig14ParallelConfigE17CommunicationType17CommunicationModeNSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI18OrchestratorConfigEENSt8optionalI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::ParallelConfig::participantIds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig20getCommunicationModeEv", "tensorrt_llm::executor::ParallelConfig::getCommunicationMode"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig20getCommunicationTypeEv", "tensorrt_llm::executor::ParallelConfig::getCommunicationType"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig12getDeviceIdsEv", "tensorrt_llm::executor::ParallelConfig::getDeviceIds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig11getNumNodesEv", "tensorrt_llm::executor::ParallelConfig::getNumNodes"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig21getOrchestratorConfigEv", "tensorrt_llm::executor::ParallelConfig::getOrchestratorConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig17getParticipantIdsEv", "tensorrt_llm::executor::ParallelConfig::getParticipantIds"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig9mCommModeE", "tensorrt_llm::executor::ParallelConfig::mCommMode"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig9mCommTypeE", "tensorrt_llm::executor::ParallelConfig::mCommType"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig10mDeviceIdsE", "tensorrt_llm::executor::ParallelConfig::mDeviceIds"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig9mNumNodesE", "tensorrt_llm::executor::ParallelConfig::mNumNodes"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig19mOrchestratorConfigE", "tensorrt_llm::executor::ParallelConfig::mOrchestratorConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig15mParticipantIdsE", "tensorrt_llm::executor::ParallelConfig::mParticipantIds"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor14ParallelConfig20setCommunicationModeE17CommunicationMode", "tensorrt_llm::executor::ParallelConfig::setCommunicationMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig20setCommunicationModeE17CommunicationMode", "tensorrt_llm::executor::ParallelConfig::setCommunicationMode::mode"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig20setCommunicationTypeE17CommunicationType", "tensorrt_llm::executor::ParallelConfig::setCommunicationType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig20setCommunicationTypeE17CommunicationType", "tensorrt_llm::executor::ParallelConfig::setCommunicationType::type"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig12setDeviceIdsERKNSt6vectorI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::setDeviceIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig12setDeviceIdsERKNSt6vectorI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::setDeviceIds::deviceIds"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig11setNumNodesE10SizeType32", "tensorrt_llm::executor::ParallelConfig::setNumNodes"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig11setNumNodesE10SizeType32", "tensorrt_llm::executor::ParallelConfig::setNumNodes::numNodes"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig21setOrchestratorConfigERK18OrchestratorConfig", "tensorrt_llm::executor::ParallelConfig::setOrchestratorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig21setOrchestratorConfigERK18OrchestratorConfig", "tensorrt_llm::executor::ParallelConfig::setOrchestratorConfig::orchestratorConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig17setParticipantIdsERKNSt6vectorI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::setParticipantIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig17setParticipantIdsERKNSt6vectorI10SizeType32EE", 
"tensorrt_llm::executor::ParallelConfig::setParticipantIds::participantIds"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfigE", "tensorrt_llm::executor::PeftCacheConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::deviceCachePercent"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::hostCacheSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::loraPrefetchDir"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::maxAdapterSize"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::maxPagesPerBlockDevice"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::maxPagesPerBlockHost"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::numCopyStreams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::numDeviceModuleLayer"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::numEnsureWorkers"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", 
"tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::numHostModuleLayer"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::numPutWorkers"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::optimalAdapterSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig21getDeviceCachePercentEv", "tensorrt_llm::executor::PeftCacheConfig::getDeviceCachePercent"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig16getHostCacheSizeEv", "tensorrt_llm::executor::PeftCacheConfig::getHostCacheSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig18getLoraPrefetchDirEv", "tensorrt_llm::executor::PeftCacheConfig::getLoraPrefetchDir"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig17getMaxAdapterSizeEv", "tensorrt_llm::executor::PeftCacheConfig::getMaxAdapterSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig25getMaxPagesPerBlockDeviceEv", "tensorrt_llm::executor::PeftCacheConfig::getMaxPagesPerBlockDevice"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig23getMaxPagesPerBlockHostEv", "tensorrt_llm::executor::PeftCacheConfig::getMaxPagesPerBlockHost"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig17getNumCopyStreamsEv", "tensorrt_llm::executor::PeftCacheConfig::getNumCopyStreams"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig23getNumDeviceModuleLayerEv", "tensorrt_llm::executor::PeftCacheConfig::getNumDeviceModuleLayer"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig19getNumEnsureWorkersEv", "tensorrt_llm::executor::PeftCacheConfig::getNumEnsureWorkers"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig21getNumHostModuleLayerEv", "tensorrt_llm::executor::PeftCacheConfig::getNumHostModuleLayer"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig16getNumPutWorkersEv", "tensorrt_llm::executor::PeftCacheConfig::getNumPutWorkers"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig21getOptimalAdapterSizeEv", "tensorrt_llm::executor::PeftCacheConfig::getOptimalAdapterSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig22kDefaultMaxAdapterSizeE", "tensorrt_llm::executor::PeftCacheConfig::kDefaultMaxAdapterSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig30kDefaultMaxPagesPerBlockDeviceE", "tensorrt_llm::executor::PeftCacheConfig::kDefaultMaxPagesPerBlockDevice"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig28kDefaultMaxPagesPerBlockHostE", "tensorrt_llm::executor::PeftCacheConfig::kDefaultMaxPagesPerBlockHost"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig26kDefaultOptimalAdapterSizeE", "tensorrt_llm::executor::PeftCacheConfig::kDefaultOptimalAdapterSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig19mDeviceCachePercentE", "tensorrt_llm::executor::PeftCacheConfig::mDeviceCachePercent"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig14mHostCacheSizeE", "tensorrt_llm::executor::PeftCacheConfig::mHostCacheSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig16mLoraPrefetchDirE", "tensorrt_llm::executor::PeftCacheConfig::mLoraPrefetchDir"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15mMaxAdapterSizeE", "tensorrt_llm::executor::PeftCacheConfig::mMaxAdapterSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig23mMaxPagesPerBlockDeviceE", "tensorrt_llm::executor::PeftCacheConfig::mMaxPagesPerBlockDevice"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor15PeftCacheConfig21mMaxPagesPerBlockHostE", "tensorrt_llm::executor::PeftCacheConfig::mMaxPagesPerBlockHost"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15mNumCopyStreamsE", "tensorrt_llm::executor::PeftCacheConfig::mNumCopyStreams"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig21mNumDeviceModuleLayerE", "tensorrt_llm::executor::PeftCacheConfig::mNumDeviceModuleLayer"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig17mNumEnsureWorkersE", "tensorrt_llm::executor::PeftCacheConfig::mNumEnsureWorkers"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig19mNumHostModuleLayerE", "tensorrt_llm::executor::PeftCacheConfig::mNumHostModuleLayer"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig14mNumPutWorkersE", "tensorrt_llm::executor::PeftCacheConfig::mNumPutWorkers"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig19mOptimalAdapterSizeE", "tensorrt_llm::executor::PeftCacheConfig::mOptimalAdapterSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfigeqERK15PeftCacheConfig", "tensorrt_llm::executor::PeftCacheConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfigeqERK15PeftCacheConfig", "tensorrt_llm::executor::PeftCacheConfig::operator==::other"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor12PriorityTypeE", "tensorrt_llm::executor::PriorityType"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfigE", "tensorrt_llm::executor::PromptTuningConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfig18PromptTuningConfigE6TensorNSt8optionalI16VecTokenExtraIdsEE", "tensorrt_llm::executor::PromptTuningConfig::PromptTuningConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfig18PromptTuningConfigE6TensorNSt8optionalI16VecTokenExtraIdsEE", "tensorrt_llm::executor::PromptTuningConfig::PromptTuningConfig::embeddingTable"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor18PromptTuningConfig18PromptTuningConfigE6TensorNSt8optionalI16VecTokenExtraIdsEE", "tensorrt_llm::executor::PromptTuningConfig::PromptTuningConfig::inputTokenExtraIds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18PromptTuningConfig17getEmbeddingTableEv", "tensorrt_llm::executor::PromptTuningConfig::getEmbeddingTable"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18PromptTuningConfig21getInputTokenExtraIdsEv", "tensorrt_llm::executor::PromptTuningConfig::getInputTokenExtraIds"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfig15mEmbeddingTableE", "tensorrt_llm::executor::PromptTuningConfig::mEmbeddingTable"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfig19mInputTokenExtraIdsE", "tensorrt_llm::executor::PromptTuningConfig::mInputTokenExtraIds"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor14RandomSeedTypeE", "tensorrt_llm::executor::RandomSeedType"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor7RequestE", "tensorrt_llm::executor::Request"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request"], 
[0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestERK7Request", "tensorrt_llm::executor::Request::Request"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestERR7Request", "tensorrt_llm::executor::Request::Request"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::allottedTimeMs"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::badWords"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::clientId"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::contextPhaseParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", 
"tensorrt_llm::executor::Request::Request::crossAttentionMask"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::eagleConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::embeddingBias"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", 
"tensorrt_llm::executor::Request::Request::encoderInputFeatures"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::encoderInputTokenIds"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::encoderOutputLength"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::endId"], 
[0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::externalDraftTokensConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", 
"tensorrt_llm::executor::Request::Request::guidedDecodingParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::inputTokenIds"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::kvCacheRetentionConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", 
"tensorrt_llm::executor::Request::Request::languageAdapterUid"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::logitsPostProcessor"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::logitsPostProcessorName"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", 
"tensorrt_llm::executor::Request::Request::lookaheadConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::loraConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::mRopeConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::maxTokens"], [0, 
4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::multimodalEmbedding"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", 
"tensorrt_llm::executor::Request::Request::numReturnSequences"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestERK7Request", "tensorrt_llm::executor::Request::Request::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestERR7Request", "tensorrt_llm::executor::Request::Request::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::outputConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::pTuningConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::padId"], [0, 4, 
1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::positionIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::priority"], [0, 
4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::returnAllGeneratedTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", 
"tensorrt_llm::executor::Request::Request::samplingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::skipCrossAttnBlocks"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::stopWords"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::streaming"], [0, 4, 
1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::type"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request24getAdditionalOutputNamesEv", "tensorrt_llm::executor::Request::getAdditionalOutputNames"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request17getAllottedTimeMsEv", "tensorrt_llm::executor::Request::getAllottedTimeMs"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request11getBadWordsEv", "tensorrt_llm::executor::Request::getBadWords"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request11getClientIdEv", "tensorrt_llm::executor::Request::getClientId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request21getContextPhaseParamsEv", "tensorrt_llm::executor::Request::getContextPhaseParams"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request21getCrossAttentionMaskEv", "tensorrt_llm::executor::Request::getCrossAttentionMask"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request14getEagleConfigEv", "tensorrt_llm::executor::Request::getEagleConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request16getEmbeddingBiasEv", 
"tensorrt_llm::executor::Request::getEmbeddingBias"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request23getEncoderInputFeaturesEv", "tensorrt_llm::executor::Request::getEncoderInputFeatures"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request23getEncoderInputTokenIdsEv", "tensorrt_llm::executor::Request::getEncoderInputTokenIds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request22getEncoderOutputLengthEv", "tensorrt_llm::executor::Request::getEncoderOutputLength"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request8getEndIdEv", "tensorrt_llm::executor::Request::getEndId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request28getExternalDraftTokensConfigEv", "tensorrt_llm::executor::Request::getExternalDraftTokensConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request23getGuidedDecodingParamsEv", "tensorrt_llm::executor::Request::getGuidedDecodingParams"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request16getInputTokenIdsEv", "tensorrt_llm::executor::Request::getInputTokenIds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request25getKvCacheRetentionConfigEv", "tensorrt_llm::executor::Request::getKvCacheRetentionConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request21getLanguageAdapterUidEv", "tensorrt_llm::executor::Request::getLanguageAdapterUid"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request22getLogitsPostProcessorEv", "tensorrt_llm::executor::Request::getLogitsPostProcessor"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request26getLogitsPostProcessorNameEv", "tensorrt_llm::executor::Request::getLogitsPostProcessorName"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request18getLookaheadConfigEv", "tensorrt_llm::executor::Request::getLookaheadConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request13getLoraConfigEv", "tensorrt_llm::executor::Request::getLoraConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request12getMaxTokensEv", "tensorrt_llm::executor::Request::getMaxTokens"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor7Request14getMropeConfigEv", "tensorrt_llm::executor::Request::getMropeConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request22getMultimodalEmbeddingEv", "tensorrt_llm::executor::Request::getMultimodalEmbedding"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request15getOutputConfigEv", "tensorrt_llm::executor::Request::getOutputConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request8getPadIdEv", "tensorrt_llm::executor::Request::getPadId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request14getPositionIdsEv", "tensorrt_llm::executor::Request::getPositionIds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request11getPriorityEv", "tensorrt_llm::executor::Request::getPriority"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request21getPromptTuningConfigEv", "tensorrt_llm::executor::Request::getPromptTuningConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request14getRequestTypeEv", "tensorrt_llm::executor::Request::getRequestType"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request27getReturnAllGeneratedTokensEv", "tensorrt_llm::executor::Request::getReturnAllGeneratedTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request17getSamplingConfigEv", "tensorrt_llm::executor::Request::getSamplingConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request22getSkipCrossAttnBlocksEv", "tensorrt_llm::executor::Request::getSkipCrossAttnBlocks"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request12getStopWordsEv", "tensorrt_llm::executor::Request::getStopWords"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request12getStreamingEv", "tensorrt_llm::executor::Request::getStreaming"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor7Request25kBatchedPostProcessorNameE", "tensorrt_llm::executor::Request::kBatchedPostProcessorName"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor7Request16kDefaultPriorityE", "tensorrt_llm::executor::Request::kDefaultPriority"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor7Request31kDynamicPostProcessorNamePrefixE", 
"tensorrt_llm::executor::Request::kDynamicPostProcessorNamePrefix"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor7Request5mImplE", "tensorrt_llm::executor::Request::mImpl"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7RequestaSERK7Request", "tensorrt_llm::executor::Request::operator="], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7RequestaSERR7Request", "tensorrt_llm::executor::Request::operator="], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7RequestaSERK7Request", "tensorrt_llm::executor::Request::operator=::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7RequestaSERR7Request", "tensorrt_llm::executor::Request::operator=::other"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request17setAllottedTimeMsE16MillisecondsType", "tensorrt_llm::executor::Request::setAllottedTimeMs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request17setAllottedTimeMsE16MillisecondsType", "tensorrt_llm::executor::Request::setAllottedTimeMs::allottedTimeMs"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request11setBadWordsERKNSt4listI9VecTokensEE", "tensorrt_llm::executor::Request::setBadWords"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request11setBadWordsERKNSt4listI9VecTokensEE", "tensorrt_llm::executor::Request::setBadWords::badWords"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request11setClientIdE6IdType", "tensorrt_llm::executor::Request::setClientId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request11setClientIdE6IdType", "tensorrt_llm::executor::Request::setClientId::clientId"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request21setContextPhaseParamsE18ContextPhaseParams", "tensorrt_llm::executor::Request::setContextPhaseParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request21setContextPhaseParamsE18ContextPhaseParams", "tensorrt_llm::executor::Request::setContextPhaseParams::contextPhaseParams"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request21setCrossAttentionMaskE6Tensor", "tensorrt_llm::executor::Request::setCrossAttentionMask"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request21setCrossAttentionMaskE6Tensor", "tensorrt_llm::executor::Request::setCrossAttentionMask::crossAttentionMask"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request14setEagleConfigERKNSt8optionalI11EagleConfigEE", "tensorrt_llm::executor::Request::setEagleConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request14setEagleConfigERKNSt8optionalI11EagleConfigEE", "tensorrt_llm::executor::Request::setEagleConfig::eagleConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request16setEmbeddingBiasERK6Tensor", "tensorrt_llm::executor::Request::setEmbeddingBias"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request16setEmbeddingBiasERK6Tensor", "tensorrt_llm::executor::Request::setEmbeddingBias::embeddingBias"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request23setEncoderInputFeaturesE6Tensor", "tensorrt_llm::executor::Request::setEncoderInputFeatures"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request23setEncoderInputFeaturesE6Tensor", "tensorrt_llm::executor::Request::setEncoderInputFeatures::encoderInputFeatures"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request23setEncoderInputTokenIdsERK9VecTokens", "tensorrt_llm::executor::Request::setEncoderInputTokenIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request23setEncoderInputTokenIdsERK9VecTokens", "tensorrt_llm::executor::Request::setEncoderInputTokenIds::encoderInputTokenIds"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request22setEncoderOutputLengthE10SizeType32", "tensorrt_llm::executor::Request::setEncoderOutputLength"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request22setEncoderOutputLengthE10SizeType32", "tensorrt_llm::executor::Request::setEncoderOutputLength::encoderOutputLength"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request8setEndIdE10SizeType32", "tensorrt_llm::executor::Request::setEndId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request8setEndIdE10SizeType32", "tensorrt_llm::executor::Request::setEndId::endId"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor7Request28setExternalDraftTokensConfigERK25ExternalDraftTokensConfig", "tensorrt_llm::executor::Request::setExternalDraftTokensConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request28setExternalDraftTokensConfigERK25ExternalDraftTokensConfig", "tensorrt_llm::executor::Request::setExternalDraftTokensConfig::externalDraftTokensConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request23setGuidedDecodingParamsERK20GuidedDecodingParams", "tensorrt_llm::executor::Request::setGuidedDecodingParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request23setGuidedDecodingParamsERK20GuidedDecodingParams", "tensorrt_llm::executor::Request::setGuidedDecodingParams::guidedDecodingParams"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request25setKvCacheRetentionConfigERK22KvCacheRetentionConfig", "tensorrt_llm::executor::Request::setKvCacheRetentionConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request25setKvCacheRetentionConfigERK22KvCacheRetentionConfig", "tensorrt_llm::executor::Request::setKvCacheRetentionConfig::kvCacheRetentionConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request21setLanguageAdapterUidE10SizeType32", "tensorrt_llm::executor::Request::setLanguageAdapterUid"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request21setLanguageAdapterUidE10SizeType32", "tensorrt_llm::executor::Request::setLanguageAdapterUid::languageAdapterUid"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request22setLogitsPostProcessorERKNSt8optionalI19LogitsPostProcessorEE", "tensorrt_llm::executor::Request::setLogitsPostProcessor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request22setLogitsPostProcessorERKNSt8optionalI19LogitsPostProcessorEE", "tensorrt_llm::executor::Request::setLogitsPostProcessor::logitsPostProcessor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request26setLogitsPostProcessorNameERKNSt6stringE", "tensorrt_llm::executor::Request::setLogitsPostProcessorName"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request26setLogitsPostProcessorNameERKNSt6stringE", "tensorrt_llm::executor::Request::setLogitsPostProcessorName::logitsPostProcessorName"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request18setLookaheadConfigERK23LookaheadDecodingConfig", "tensorrt_llm::executor::Request::setLookaheadConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request18setLookaheadConfigERK23LookaheadDecodingConfig", "tensorrt_llm::executor::Request::setLookaheadConfig::lookaheadConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request13setLoraConfigERK10LoraConfig", "tensorrt_llm::executor::Request::setLoraConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request13setLoraConfigERK10LoraConfig", "tensorrt_llm::executor::Request::setLoraConfig::loraConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request14setMropeConfigERK11MropeConfig", "tensorrt_llm::executor::Request::setMropeConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request14setMropeConfigERK11MropeConfig", "tensorrt_llm::executor::Request::setMropeConfig::mRopeConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request22setMultimodalEmbeddingERK6Tensor", "tensorrt_llm::executor::Request::setMultimodalEmbedding"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request22setMultimodalEmbeddingERK6Tensor", "tensorrt_llm::executor::Request::setMultimodalEmbedding::multimodalEmbedding"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request15setOutputConfigERK12OutputConfig", "tensorrt_llm::executor::Request::setOutputConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request15setOutputConfigERK12OutputConfig", "tensorrt_llm::executor::Request::setOutputConfig::outputConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request8setPadIdE10SizeType32", "tensorrt_llm::executor::Request::setPadId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request8setPadIdE10SizeType32", "tensorrt_llm::executor::Request::setPadId::padId"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor7Request14setPositionIdsERKNSt6vectorI10SizeType32EE", "tensorrt_llm::executor::Request::setPositionIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request14setPositionIdsERKNSt6vectorI10SizeType32EE", "tensorrt_llm::executor::Request::setPositionIds::positionIds"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request11setPriorityE12PriorityType", "tensorrt_llm::executor::Request::setPriority"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request11setPriorityE12PriorityType", "tensorrt_llm::executor::Request::setPriority::priority"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request21setPromptTuningConfigERK18PromptTuningConfig", "tensorrt_llm::executor::Request::setPromptTuningConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request21setPromptTuningConfigERK18PromptTuningConfig", "tensorrt_llm::executor::Request::setPromptTuningConfig::pTuningConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request14setRequestTypeERK11RequestType", "tensorrt_llm::executor::Request::setRequestType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request14setRequestTypeERK11RequestType", "tensorrt_llm::executor::Request::setRequestType::requestType"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request27setReturnAllGeneratedTokensEb", "tensorrt_llm::executor::Request::setReturnAllGeneratedTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request27setReturnAllGeneratedTokensEb", "tensorrt_llm::executor::Request::setReturnAllGeneratedTokens::returnAllGeneratedTokens"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request17setSamplingConfigERK14SamplingConfig", "tensorrt_llm::executor::Request::setSamplingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request17setSamplingConfigERK14SamplingConfig", "tensorrt_llm::executor::Request::setSamplingConfig::config"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request22setSkipCrossAttnBlocksE6Tensor", "tensorrt_llm::executor::Request::setSkipCrossAttnBlocks"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request22setSkipCrossAttnBlocksE6Tensor", "tensorrt_llm::executor::Request::setSkipCrossAttnBlocks::skipCrossAttnBlocks"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request12setStopWordsERKNSt4listI9VecTokensEE", "tensorrt_llm::executor::Request::setStopWords"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request12setStopWordsERKNSt4listI9VecTokensEE", "tensorrt_llm::executor::Request::setStopWords::stopWords"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request12setStreamingEb", "tensorrt_llm::executor::Request::setStreaming"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request12setStreamingEb", "tensorrt_llm::executor::Request::setStreaming::streaming"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7RequestD0Ev", "tensorrt_llm::executor::Request::~Request"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetricsE", "tensorrt_llm::executor::RequestPerfMetrics"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetricsE", "tensorrt_llm::executor::RequestPerfMetrics::KvCacheMetrics"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics14kvCacheHitRateE", "tensorrt_llm::executor::RequestPerfMetrics::KvCacheMetrics::kvCacheHitRate"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics15numMissedBlocksE", "tensorrt_llm::executor::RequestPerfMetrics::KvCacheMetrics::numMissedBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics21numNewAllocatedBlocksE", "tensorrt_llm::executor::RequestPerfMetrics::KvCacheMetrics::numNewAllocatedBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics15numReusedBlocksE", "tensorrt_llm::executor::RequestPerfMetrics::KvCacheMetrics::numReusedBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics23numTotalAllocatedBlocksE", "tensorrt_llm::executor::RequestPerfMetrics::KvCacheMetrics::numTotalAllocatedBlocks"], [0, 2, 1, 
"_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics26SpeculativeDecodingMetricsE", "tensorrt_llm::executor::RequestPerfMetrics::SpeculativeDecodingMetrics"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics26SpeculativeDecodingMetrics14acceptanceRateE", "tensorrt_llm::executor::RequestPerfMetrics::SpeculativeDecodingMetrics::acceptanceRate"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics26SpeculativeDecodingMetrics24totalAcceptedDraftTokensE", "tensorrt_llm::executor::RequestPerfMetrics::SpeculativeDecodingMetrics::totalAcceptedDraftTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics26SpeculativeDecodingMetrics16totalDraftTokensE", "tensorrt_llm::executor::RequestPerfMetrics::SpeculativeDecodingMetrics::totalDraftTokens"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics9TimePointE", "tensorrt_llm::executor::RequestPerfMetrics::TimePoint"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetricsE", "tensorrt_llm::executor::RequestPerfMetrics::TimingMetrics"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics11arrivalTimeE", "tensorrt_llm::executor::RequestPerfMetrics::TimingMetrics::arrivalTime"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics18firstScheduledTimeE", "tensorrt_llm::executor::RequestPerfMetrics::TimingMetrics::firstScheduledTime"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics14firstTokenTimeE", "tensorrt_llm::executor::RequestPerfMetrics::TimingMetrics::firstTokenTime"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics11kvCacheSizeE", "tensorrt_llm::executor::RequestPerfMetrics::TimingMetrics::kvCacheSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics18kvCacheTransferEndE", "tensorrt_llm::executor::RequestPerfMetrics::TimingMetrics::kvCacheTransferEnd"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics20kvCacheTransferStartE", "tensorrt_llm::executor::RequestPerfMetrics::TimingMetrics::kvCacheTransferStart"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics13lastTokenTimeE", "tensorrt_llm::executor::RequestPerfMetrics::TimingMetrics::lastTokenTime"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics9firstIterE", "tensorrt_llm::executor::RequestPerfMetrics::firstIter"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics4iterE", "tensorrt_llm::executor::RequestPerfMetrics::iter"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14kvCacheMetricsE", "tensorrt_llm::executor::RequestPerfMetrics::kvCacheMetrics"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics8lastIterE", "tensorrt_llm::executor::RequestPerfMetrics::lastIter"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics19speculativeDecodingE", "tensorrt_llm::executor::RequestPerfMetrics::speculativeDecoding"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13timingMetricsE", "tensorrt_llm::executor::RequestPerfMetrics::timingMetrics"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor12RequestStageE", "tensorrt_llm::executor::RequestStage"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12RequestStage20kCONTEXT_IN_PROGRESSE", "tensorrt_llm::executor::RequestStage::kCONTEXT_IN_PROGRESS"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12RequestStage20kENCODER_IN_PROGRESSE", "tensorrt_llm::executor::RequestStage::kENCODER_IN_PROGRESS"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12RequestStage20kGENERATION_COMPLETEE", "tensorrt_llm::executor::RequestStage::kGENERATION_COMPLETE"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12RequestStage23kGENERATION_IN_PROGRESSE", "tensorrt_llm::executor::RequestStage::kGENERATION_IN_PROGRESS"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12RequestStage7kQUEUEDE", "tensorrt_llm::executor::RequestStage::kQUEUED"], [0, 2, 1, 
"_CPPv4N12tensorrt_llm8executor12RequestStatsE", "tensorrt_llm::executor::RequestStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats24allocNewBlocksPerRequestE", "tensorrt_llm::executor::RequestStats::allocNewBlocksPerRequest"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats26allocTotalBlocksPerRequestE", "tensorrt_llm::executor::RequestStats::allocTotalBlocksPerRequest"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats26avgNumDecodedTokensPerIterE", "tensorrt_llm::executor::RequestStats::avgNumDecodedTokensPerIter"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats22contextPrefillPositionE", "tensorrt_llm::executor::RequestStats::contextPrefillPosition"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats15disServingStatsE", "tensorrt_llm::executor::RequestStats::disServingStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats2idE", "tensorrt_llm::executor::RequestStats::id"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats24kvCacheHitRatePerRequestE", "tensorrt_llm::executor::RequestStats::kvCacheHitRatePerRequest"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats22missedBlocksPerRequestE", "tensorrt_llm::executor::RequestStats::missedBlocksPerRequest"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats18numGeneratedTokensE", "tensorrt_llm::executor::RequestStats::numGeneratedTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats6pausedE", "tensorrt_llm::executor::RequestStats::paused"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats22reusedBlocksPerRequestE", "tensorrt_llm::executor::RequestStats::reusedBlocksPerRequest"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats9scheduledE", "tensorrt_llm::executor::RequestStats::scheduled"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats5stageE", "tensorrt_llm::executor::RequestStats::stage"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor24RequestStatsPerIterationE", 
"tensorrt_llm::executor::RequestStatsPerIteration"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor24RequestStatsPerIteration4iterE", "tensorrt_llm::executor::RequestStatsPerIteration::iter"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor24RequestStatsPerIteration12requestStatsE", "tensorrt_llm::executor::RequestStatsPerIteration::requestStats"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor11RequestTypeE", "tensorrt_llm::executor::RequestType"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor11RequestType35REQUEST_TYPE_CONTEXT_AND_GENERATIONE", "tensorrt_llm::executor::RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor11RequestType25REQUEST_TYPE_CONTEXT_ONLYE", "tensorrt_llm::executor::RequestType::REQUEST_TYPE_CONTEXT_ONLY"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor11RequestType28REQUEST_TYPE_GENERATION_ONLYE", "tensorrt_llm::executor::RequestType::REQUEST_TYPE_GENERATION_ONLY"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8ResponseE", "tensorrt_llm::executor::Response"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdType6ResultNSt8optionalI6IdTypeEE", "tensorrt_llm::executor::Response::Response"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdTypeNSt6stringENSt8optionalI6IdTypeEE", "tensorrt_llm::executor::Response::Response"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseERK8Response", "tensorrt_llm::executor::Response::Response"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseERR8Response", "tensorrt_llm::executor::Response::Response"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdType6ResultNSt8optionalI6IdTypeEE", "tensorrt_llm::executor::Response::Response::Result"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdType6ResultNSt8optionalI6IdTypeEE", "tensorrt_llm::executor::Response::Response::clientId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdTypeNSt6stringENSt8optionalI6IdTypeEE", 
"tensorrt_llm::executor::Response::Response::clientId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdTypeNSt6stringENSt8optionalI6IdTypeEE", "tensorrt_llm::executor::Response::Response::errorMsg"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseERK8Response", "tensorrt_llm::executor::Response::Response::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseERR8Response", "tensorrt_llm::executor::Response::Response::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdType6ResultNSt8optionalI6IdTypeEE", "tensorrt_llm::executor::Response::Response::requestId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdTypeNSt6stringENSt8optionalI6IdTypeEE", "tensorrt_llm::executor::Response::Response::requestId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8Response11getClientIdEv", "tensorrt_llm::executor::Response::getClientId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8Response11getErrorMsgEv", "tensorrt_llm::executor::Response::getErrorMsg"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8Response12getRequestIdEv", "tensorrt_llm::executor::Response::getRequestId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8Response9getResultEv", "tensorrt_llm::executor::Response::getResult"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8Response8hasErrorEv", "tensorrt_llm::executor::Response::hasError"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8Response5mImplE", "tensorrt_llm::executor::Response::mImpl"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8ResponseaSERK8Response", "tensorrt_llm::executor::Response::operator="], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8ResponseaSERR8Response", "tensorrt_llm::executor::Response::operator="], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8ResponseaSERK8Response", "tensorrt_llm::executor::Response::operator=::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8ResponseaSERR8Response", "tensorrt_llm::executor::Response::operator=::other"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor8ResponseD0Ev", "tensorrt_llm::executor::Response::~Response"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor6ResultE", "tensorrt_llm::executor::Result"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result17additionalOutputsE", "tensorrt_llm::executor::Result::additionalOutputs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result13contextLogitsE", "tensorrt_llm::executor::Result::contextLogits"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result18contextPhaseParamsE", "tensorrt_llm::executor::Result::contextPhaseParams"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result11cumLogProbsE", "tensorrt_llm::executor::Result::cumLogProbs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result12decodingIterE", "tensorrt_llm::executor::Result::decodingIter"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result13encoderOutputE", "tensorrt_llm::executor::Result::encoderOutput"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result13finishReasonsE", "tensorrt_llm::executor::Result::finishReasons"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result16generationLogitsE", "tensorrt_llm::executor::Result::generationLogits"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result7isFinalE", "tensorrt_llm::executor::Result::isFinal"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result15isSequenceFinalE", "tensorrt_llm::executor::Result::isSequenceFinal"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result8logProbsE", "tensorrt_llm::executor::Result::logProbs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result14outputTokenIdsE", "tensorrt_llm::executor::Result::outputTokenIds"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result18requestPerfMetricsE", "tensorrt_llm::executor::Result::requestPerfMetrics"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result13sequenceIndexE", "tensorrt_llm::executor::Result::sequenceIndex"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result21specDecFastLogitsInfoE", "tensorrt_llm::executor::Result::specDecFastLogitsInfo"], [0, 1, 1, 
"_CPPv4N12tensorrt_llm8executor17RetentionPriorityE", "tensorrt_llm::executor::RetentionPriority"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDurationE", "tensorrt_llm::executor::RetentionPriorityAndDuration"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDuration28RetentionPriorityAndDurationERKNSt8optionalI17RetentionPriorityEERKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::RetentionPriorityAndDuration::RetentionPriorityAndDuration"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDuration28RetentionPriorityAndDurationERKNSt8optionalI17RetentionPriorityEERKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::RetentionPriorityAndDuration::RetentionPriorityAndDuration::durationMs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDuration28RetentionPriorityAndDurationERKNSt8optionalI17RetentionPriorityEERKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::RetentionPriorityAndDuration::RetentionPriorityAndDuration::retentionPriority"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDuration10durationMsE", "tensorrt_llm::executor::RetentionPriorityAndDuration::durationMs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDuration17retentionPriorityE", "tensorrt_llm::executor::RetentionPriorityAndDuration::retentionPriority"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfigE", "tensorrt_llm::executor::SamplingConfig"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::beamSearchDiversityRate"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", 
"tensorrt_llm::executor::SamplingConfig::SamplingConfig::beamWidth"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::beamWidthArray"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::earlyStopping"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::frequencyPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::lengthPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", 
"tensorrt_llm::executor::SamplingConfig::SamplingConfig::minP"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::minTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::noRepeatNgramSize"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::numReturnSequences"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::presencePenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", 
"tensorrt_llm::executor::SamplingConfig::SamplingConfig::repetitionPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::seed"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::temperature"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::topK"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::topP"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", 
"tensorrt_llm::executor::SamplingConfig::SamplingConfig::topPDecay"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::topPMin"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::topPResetIds"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig28checkBeamSearchDiversityRateERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkBeamSearchDiversityRate"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig28checkBeamSearchDiversityRateERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkBeamSearchDiversityRate::beamSearchDiversityRate"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14checkBeamWidthE10SizeType32", "tensorrt_llm::executor::SamplingConfig::checkBeamWidth"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig14checkBeamWidthE10SizeType32", "tensorrt_llm::executor::SamplingConfig::checkBeamWidth::beamWidth"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig19checkBeamWidthArrayERKNSt8optionalINSt6vectorI10SizeType32EEEEK10SizeType32", "tensorrt_llm::executor::SamplingConfig::checkBeamWidthArray"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig19checkBeamWidthArrayERKNSt8optionalINSt6vectorI10SizeType32EEEEK10SizeType32", "tensorrt_llm::executor::SamplingConfig::checkBeamWidthArray::beamWidth"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig19checkBeamWidthArrayERKNSt8optionalINSt6vectorI10SizeType32EEEEK10SizeType32", "tensorrt_llm::executor::SamplingConfig::checkBeamWidthArray::beamWidthArray"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18checkEarlyStoppingERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::checkEarlyStopping"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18checkEarlyStoppingERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::checkEarlyStopping::earlyStopping"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18checkLengthPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkLengthPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18checkLengthPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkLengthPenalty::lengthPenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig9checkMinPERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkMinP"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig9checkMinPERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkMinP::minP"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14checkMinTokensERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::checkMinTokens"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig14checkMinTokensERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::checkMinTokens::minTokens"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig22checkNoRepeatNgramSizeERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::checkNoRepeatNgramSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig22checkNoRepeatNgramSizeERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::checkNoRepeatNgramSize::noRepeatNgramSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig23checkNumReturnSequencesERKNSt8optionalI10SizeType32EE10SizeType32", "tensorrt_llm::executor::SamplingConfig::checkNumReturnSequences"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig23checkNumReturnSequencesERKNSt8optionalI10SizeType32EE10SizeType32", "tensorrt_llm::executor::SamplingConfig::checkNumReturnSequences::beamWidth"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig23checkNumReturnSequencesERKNSt8optionalI10SizeType32EE10SizeType32", "tensorrt_llm::executor::SamplingConfig::checkNumReturnSequences::numReturnSequences"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig22checkRepetitionPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkRepetitionPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig22checkRepetitionPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkRepetitionPenalty::repetitionpenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16checkTemperatureERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTemperature"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16checkTemperatureERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTemperature::temperature"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig9checkTopKERKNSt8optionalI9FloatTypeEE", 
"tensorrt_llm::executor::SamplingConfig::checkTopK"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig9checkTopKERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTopK::topK"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig9checkTopPERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTopP"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig9checkTopPERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTopP::topP"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14checkTopPDecayERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTopPDecay"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14checkTopPDecayERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTopPDecay::topPDecay"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12checkTopPMinERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTopPMin"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12checkTopPMinERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTopPMin::topPMin"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig17checkTopPResetIdsERKNSt8optionalI11TokenIdTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTopPResetIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig17checkTopPResetIdsERKNSt8optionalI11TokenIdTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTopPResetIds::topPResetIds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig26getBeamSearchDiversityRateEv", "tensorrt_llm::executor::SamplingConfig::getBeamSearchDiversityRate"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig12getBeamWidthEv", "tensorrt_llm::executor::SamplingConfig::getBeamWidth"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig17getBeamWidthArrayEv", "tensorrt_llm::executor::SamplingConfig::getBeamWidthArray"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor14SamplingConfig16getEarlyStoppingEv", "tensorrt_llm::executor::SamplingConfig::getEarlyStopping"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig19getFrequencyPenaltyEv", "tensorrt_llm::executor::SamplingConfig::getFrequencyPenalty"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig16getLengthPenaltyEv", "tensorrt_llm::executor::SamplingConfig::getLengthPenalty"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig7getMinPEv", "tensorrt_llm::executor::SamplingConfig::getMinP"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig12getMinTokensEv", "tensorrt_llm::executor::SamplingConfig::getMinTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig20getNoRepeatNgramSizeEv", "tensorrt_llm::executor::SamplingConfig::getNoRepeatNgramSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig17getNumReturnBeamsEv", "tensorrt_llm::executor::SamplingConfig::getNumReturnBeams"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig21getNumReturnSequencesEv", "tensorrt_llm::executor::SamplingConfig::getNumReturnSequences"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig18getPresencePenaltyEv", "tensorrt_llm::executor::SamplingConfig::getPresencePenalty"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig20getRepetitionPenaltyEv", "tensorrt_llm::executor::SamplingConfig::getRepetitionPenalty"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig7getSeedEv", "tensorrt_llm::executor::SamplingConfig::getSeed"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig14getTemperatureEv", "tensorrt_llm::executor::SamplingConfig::getTemperature"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig7getTopKEv", "tensorrt_llm::executor::SamplingConfig::getTopK"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig7getTopPEv", "tensorrt_llm::executor::SamplingConfig::getTopP"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor14SamplingConfig12getTopPDecayEv", "tensorrt_llm::executor::SamplingConfig::getTopPDecay"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig10getTopPMinEv", "tensorrt_llm::executor::SamplingConfig::getTopPMin"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig15getTopPResetIdsEv", "tensorrt_llm::executor::SamplingConfig::getTopPResetIds"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig24mBeamSearchDiversityRateE", "tensorrt_llm::executor::SamplingConfig::mBeamSearchDiversityRate"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig10mBeamWidthE", "tensorrt_llm::executor::SamplingConfig::mBeamWidth"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig15mBeamWidthArrayE", "tensorrt_llm::executor::SamplingConfig::mBeamWidthArray"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14mEarlyStoppingE", "tensorrt_llm::executor::SamplingConfig::mEarlyStopping"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig17mFrequencyPenaltyE", "tensorrt_llm::executor::SamplingConfig::mFrequencyPenalty"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14mLengthPenaltyE", "tensorrt_llm::executor::SamplingConfig::mLengthPenalty"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig5mMinPE", "tensorrt_llm::executor::SamplingConfig::mMinP"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig10mMinTokensE", "tensorrt_llm::executor::SamplingConfig::mMinTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18mNoRepeatNgramSizeE", "tensorrt_llm::executor::SamplingConfig::mNoRepeatNgramSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig15mNumReturnBeamsE", "tensorrt_llm::executor::SamplingConfig::mNumReturnBeams"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig19mNumReturnSequencesE", "tensorrt_llm::executor::SamplingConfig::mNumReturnSequences"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16mPresencePenaltyE", 
"tensorrt_llm::executor::SamplingConfig::mPresencePenalty"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18mRepetitionPenaltyE", "tensorrt_llm::executor::SamplingConfig::mRepetitionPenalty"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig5mSeedE", "tensorrt_llm::executor::SamplingConfig::mSeed"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12mTemperatureE", "tensorrt_llm::executor::SamplingConfig::mTemperature"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig5mTopKE", "tensorrt_llm::executor::SamplingConfig::mTopK"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig5mTopPE", "tensorrt_llm::executor::SamplingConfig::mTopP"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig10mTopPDecayE", "tensorrt_llm::executor::SamplingConfig::mTopPDecay"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig8mTopPMinE", "tensorrt_llm::executor::SamplingConfig::mTopPMin"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig13mTopPResetIdsE", "tensorrt_llm::executor::SamplingConfig::mTopPResetIds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfigeqERK14SamplingConfig", "tensorrt_llm::executor::SamplingConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfigeqERK14SamplingConfig", "tensorrt_llm::executor::SamplingConfig::operator==::other"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig26setBeamSearchDiversityRateERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setBeamSearchDiversityRate"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig26setBeamSearchDiversityRateERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setBeamSearchDiversityRate::beamSearchDiversityRate"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12setBeamWidthE10SizeType32", "tensorrt_llm::executor::SamplingConfig::setBeamWidth"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12setBeamWidthE10SizeType32", 
"tensorrt_llm::executor::SamplingConfig::setBeamWidth::beamWidth"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig17setBeamWidthArrayERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::setBeamWidthArray"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig17setBeamWidthArrayERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::setBeamWidthArray::beamWidthArray"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16setEarlyStoppingERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setEarlyStopping"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16setEarlyStoppingERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setEarlyStopping::earlyStopping"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig19setFrequencyPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setFrequencyPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig19setFrequencyPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setFrequencyPenalty::frequencyPenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16setLengthPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setLengthPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16setLengthPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setLengthPenalty::lengthPenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setMinPERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setMinP"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setMinPERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setMinP::minP"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12setMinTokensERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setMinTokens"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig12setMinTokensERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setMinTokens::minTokens"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig20setNoRepeatNgramSizeERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setNoRepeatNgramSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig20setNoRepeatNgramSizeERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setNoRepeatNgramSize::noRepeatNgramSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig21setNumReturnSequencesERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setNumReturnSequences"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig21setNumReturnSequencesERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setNumReturnSequences::numReturnSequences"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18setPresencePenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setPresencePenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18setPresencePenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setPresencePenalty::presencePenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig20setRepetitionPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setRepetitionPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig20setRepetitionPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setRepetitionPenalty::repetitionPenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setSeedERKNSt8optionalI14RandomSeedTypeEE", "tensorrt_llm::executor::SamplingConfig::setSeed"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setSeedERKNSt8optionalI14RandomSeedTypeEE", "tensorrt_llm::executor::SamplingConfig::setSeed::seed"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig14setTemperatureERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setTemperature"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14setTemperatureERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setTemperature::temperature"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setTopKERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setTopK"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setTopKERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setTopK::topK"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setTopPERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setTopP"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setTopPERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setTopP::topP"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12setTopPDecayERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setTopPDecay"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12setTopPDecayERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setTopPDecay::topPDecay"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig10setTopPMinERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setTopPMin"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig10setTopPMinERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setTopPMin::topPMin"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig15setTopPResetIdsERKNSt8optionalI11TokenIdTypeEE", "tensorrt_llm::executor::SamplingConfig::setTopPResetIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig15setTopPResetIdsERKNSt8optionalI11TokenIdTypeEE", "tensorrt_llm::executor::SamplingConfig::setTopPResetIds::topPResetIds"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig20updateNumReturnBeamsEv", "tensorrt_llm::executor::SamplingConfig::updateNumReturnBeams"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor15SchedulerConfigE", "tensorrt_llm::executor::SchedulerConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig15SchedulerConfigE23CapacitySchedulerPolicyNSt8optionalI21ContextChunkingPolicyEENSt8optionalI18DynamicBatchConfigEE", "tensorrt_llm::executor::SchedulerConfig::SchedulerConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig15SchedulerConfigE23CapacitySchedulerPolicyNSt8optionalI21ContextChunkingPolicyEENSt8optionalI18DynamicBatchConfigEE", "tensorrt_llm::executor::SchedulerConfig::SchedulerConfig::capacitySchedulerPolicy"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig15SchedulerConfigE23CapacitySchedulerPolicyNSt8optionalI21ContextChunkingPolicyEENSt8optionalI18DynamicBatchConfigEE", "tensorrt_llm::executor::SchedulerConfig::SchedulerConfig::contextChunkingPolicy"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig15SchedulerConfigE23CapacitySchedulerPolicyNSt8optionalI21ContextChunkingPolicyEENSt8optionalI18DynamicBatchConfigEE", "tensorrt_llm::executor::SchedulerConfig::SchedulerConfig::dynamicBatchConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15SchedulerConfig26getCapacitySchedulerPolicyEv", "tensorrt_llm::executor::SchedulerConfig::getCapacitySchedulerPolicy"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15SchedulerConfig24getContextChunkingPolicyEv", "tensorrt_llm::executor::SchedulerConfig::getContextChunkingPolicy"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15SchedulerConfig21getDynamicBatchConfigEv", "tensorrt_llm::executor::SchedulerConfig::getDynamicBatchConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig24mCapacitySchedulerPolicyE", "tensorrt_llm::executor::SchedulerConfig::mCapacitySchedulerPolicy"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig22mContextChunkingPolicyE", 
"tensorrt_llm::executor::SchedulerConfig::mContextChunkingPolicy"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig19mDynamicBatchConfigE", "tensorrt_llm::executor::SchedulerConfig::mDynamicBatchConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15SchedulerConfigeqERK15SchedulerConfig", "tensorrt_llm::executor::SchedulerConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor15SchedulerConfigeqERK15SchedulerConfig", "tensorrt_llm::executor::SchedulerConfig::operator==::other"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor13SerializationE", "tensorrt_llm::executor::Serialization"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization32deserializeAdditionalModelOutputERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeAdditionalModelOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization32deserializeAdditionalModelOutputERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeAdditionalModelOutput::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization27deserializeAdditionalOutputERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeAdditionalOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization27deserializeAdditionalOutputERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeAdditionalOutput::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization15deserializeBoolERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeBool"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization15deserializeBoolERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeBool::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization21deserializeCacheStateERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeCacheState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization21deserializeCacheStateERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeCacheState::is"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization33deserializeCacheTransceiverConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeCacheTransceiverConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization33deserializeCacheTransceiverConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeCacheTransceiverConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeCommStateERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeCommState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeCommStateERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeCommState::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeContextPhaseParamsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeContextPhaseParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeContextPhaseParamsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeContextPhaseParams::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeDataTransceiverStateERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeDataTransceiverState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeDataTransceiverStateERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDataTransceiverState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeDataTransceiverStateERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeDataTransceiverState::buffer"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeDataTransceiverStateERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDataTransceiverState::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeDebugConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDebugConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization22deserializeDebugConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDebugConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeDecodingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeDecodingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDecodingConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeDecodingModeERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDecodingMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeDecodingModeERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDecodingMode::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization33deserializeDisServingRequestStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDisServingRequestStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization33deserializeDisServingRequestStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDisServingRequestStats::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeDynamicBatchConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDynamicBatchConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeDynamicBatchConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDynamicBatchConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeEagleConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeEagleConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeEagleConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeEagleConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeExecutorConfigERNSt7istreamE", 
"tensorrt_llm::executor::Serialization::deserializeExecutorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeExecutorConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeExecutorConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization40deserializeExtendedRuntimePerfKnobConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeExtendedRuntimePerfKnobConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization40deserializeExtendedRuntimePerfKnobConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeExtendedRuntimePerfKnobConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization36deserializeExternalDraftTokensConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeExternalDraftTokensConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization36deserializeExternalDraftTokensConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeExternalDraftTokensConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeGuidedDecodingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeGuidedDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeGuidedDecodingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeGuidedDecodingConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeGuidedDecodingParamsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeGuidedDecodingParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeGuidedDecodingParamsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeGuidedDecodingParams::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization32deserializeInflightBatchingStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeInflightBatchingStats"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization32deserializeInflightBatchingStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeInflightBatchingStats::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeIterationStatsERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeIterationStats"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeIterationStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeIterationStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeIterationStatsERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeIterationStats::buffer"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeIterationStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeIterationStats::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization28deserializeIterationStatsVecERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeIterationStatsVec"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization28deserializeIterationStatsVecERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeIterationStatsVec::buffer"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization24deserializeKvCacheConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeKvCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization24deserializeKvCacheConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeKvCacheConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization33deserializeKvCacheRetentionConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeKvCacheRetentionConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization33deserializeKvCacheRetentionConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeKvCacheRetentionConfig::is"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization23deserializeKvCacheStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeKvCacheStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeKvCacheStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeKvCacheStats::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization34deserializeLookaheadDecodingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeLookaheadDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization34deserializeLookaheadDecodingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeLookaheadDecodingConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization21deserializeLoraConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeLoraConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization21deserializeLoraConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeLoraConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeModelTypeERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeModelType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeModelTypeERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeModelType::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeMropeConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeMropeConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeMropeConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeMropeConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeOrchestratorConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeOrchestratorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeOrchestratorConfigERNSt7istreamE", 
"tensorrt_llm::executor::Serialization::deserializeOrchestratorConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeOutputConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeOutputConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeOutputConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeOutputConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeParallelConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeParallelConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeParallelConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeParallelConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization26deserializePeftCacheConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializePeftCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization26deserializePeftCacheConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializePeftCacheConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializePromptTuningConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializePromptTuningConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializePromptTuningConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializePromptTuningConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization18deserializeRequestERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequest"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization18deserializeRequestERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequest::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeRequestPerfMetricsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequestPerfMetrics"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization29deserializeRequestPerfMetricsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequestPerfMetrics::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeRequestStageERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequestStage"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeRequestStageERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequestStage::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeRequestStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequestStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeRequestStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequestStats::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization35deserializeRequestStatsPerIterationERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeRequestStatsPerIteration"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization35deserializeRequestStatsPerIterationERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequestStatsPerIteration"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization35deserializeRequestStatsPerIterationERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeRequestStatsPerIteration::buffer"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization35deserializeRequestStatsPerIterationERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequestStatsPerIteration::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization38deserializeRequestStatsPerIterationVecERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeRequestStatsPerIterationVec"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization38deserializeRequestStatsPerIterationVecERNSt6vectorIcEE", 
"tensorrt_llm::executor::Serialization::deserializeRequestStatsPerIterationVec::buffer"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization19deserializeResponseERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeResponse"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization19deserializeResponseERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeResponse::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeResponsesERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeResponses"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeResponsesERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeResponses::buffer"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization17deserializeResultERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeResult"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization17deserializeResultERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeResult::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeSamplingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSamplingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeSamplingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSamplingConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization26deserializeSchedulerConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSchedulerConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization26deserializeSchedulerConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSchedulerConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeSocketStateERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSocketState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeSocketStateERNSt7istreamE", 
"tensorrt_llm::executor::Serialization::deserializeSocketState::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization32deserializeSpecDecFastLogitsInfoERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSpecDecFastLogitsInfo"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization32deserializeSpecDecFastLogitsInfoERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSpecDecFastLogitsInfo::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization36deserializeSpeculativeDecodingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSpeculativeDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization36deserializeSpeculativeDecodingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSpeculativeDecodingConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization30deserializeStaticBatchingStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeStaticBatchingStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization30deserializeStaticBatchingStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeStaticBatchingStats::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization17deserializeStringERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeString"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization17deserializeStringERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeString::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization17deserializeTensorERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeTensor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization17deserializeTensorERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeTensor::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeTimePointERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeTimePoint"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization20deserializeTimePointERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeTimePoint::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization36deserializeTokenRangeRetentionConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeTokenRangeRetentionConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization36deserializeTokenRangeRetentionConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeTokenRangeRetentionConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK10LoraConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11DebugConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11EagleConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11MropeConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12DecodingModeRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12KvCacheStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12OutputConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStageRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK13KvCacheConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14DecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ExecutorConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStats", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ParallelConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14SamplingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15PeftCacheConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15SchedulerConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK16AdditionalOutputRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18ContextPhaseParamsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18DynamicBatchConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18OrchestratorConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 
3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18PromptTuningConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18RequestPerfMetricsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK19StaticBatchingStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20DataTransceiverState", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20DataTransceiverStateRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingParamsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21AdditionalModelOutputRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21InflightBatchingStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22CacheTransceiverConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22DisServingRequestStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22KvCacheRetentionConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK23LookaheadDecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK24RequestStatsPerIteration", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK24RequestStatsPerIterationRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25ExternalDraftTokensConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25SpeculativeDecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK29ExtendedRuntimePerfKnobConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK33SpeculativeDecodingFastLogitsInfoRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6ResultRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6TensorRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK7RequestRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK8ResponseRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN18RequestPerfMetrics9TimePointERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN22KvCacheRetentionConfig25TokenRangeRetentionConfigERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10CacheStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache11SocketStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache9CommStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI14IterationStatsEE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI24RequestStatsPerIterationEE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI8ResponseEE", "tensorrt_llm::executor::Serialization::serialize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21AdditionalModelOutputRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::additionalModelOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK16AdditionalOutputRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::additionalOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22CacheTransceiverConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::cacheTransceiverConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK10LoraConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11MropeConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::config"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12OutputConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14SamplingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18PromptTuningConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25ExternalDraftTokensConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18ContextPhaseParamsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::contextPhaseParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20DataTransceiverState", "tensorrt_llm::executor::Serialization::serialize::dataTransceiverState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20DataTransceiverStateRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::dataTransceiverState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11DebugConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::debugConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14DecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::decodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12DecodingModeRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::decodingMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18DynamicBatchConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::dynamicBatchConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11EagleConfigRNSt7ostreamE", 
"tensorrt_llm::executor::Serialization::serialize::eagleConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ExecutorConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::executorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK29ExtendedRuntimePerfKnobConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::extendedRuntimePerfKnobConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::guidedDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingParamsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::guidedDecodingParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21InflightBatchingStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::inflightBatchingStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK33SpeculativeDecodingFastLogitsInfoRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::info"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStats", "tensorrt_llm::executor::Serialization::serialize::iterStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::iterStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI14IterationStatsEE", "tensorrt_llm::executor::Serialization::serialize::iterStatsVec"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK13KvCacheConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::kvCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22KvCacheRetentionConfigRNSt7ostreamE", 
"tensorrt_llm::executor::Serialization::serialize::kvCacheRetentionConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12KvCacheStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::kvCacheStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK23LookaheadDecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::lookaheadDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18RequestPerfMetricsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::metrics"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18OrchestratorConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::orchestratorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK10LoraConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11DebugConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11EagleConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11MropeConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12DecodingModeRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12KvCacheStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12OutputConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStageRNSt7ostreamE", 
"tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK13KvCacheConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14DecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ExecutorConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ParallelConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14SamplingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15PeftCacheConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15SchedulerConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK16AdditionalOutputRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18ContextPhaseParamsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18DynamicBatchConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18OrchestratorConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18PromptTuningConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18RequestPerfMetricsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK19StaticBatchingStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20DataTransceiverStateRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingParamsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21AdditionalModelOutputRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21InflightBatchingStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22CacheTransceiverConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22DisServingRequestStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22KvCacheRetentionConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK23LookaheadDecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK24RequestStatsPerIterationRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25ExternalDraftTokensConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25SpeculativeDecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK29ExtendedRuntimePerfKnobConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK33SpeculativeDecodingFastLogitsInfoRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6ResultRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6TensorRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK7RequestRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK8ResponseRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN18RequestPerfMetrics9TimePointERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN22KvCacheRetentionConfig25TokenRangeRetentionConfigERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10CacheStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache11SocketStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache9CommStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ParallelConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::parallelConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15PeftCacheConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::peftCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK7RequestRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::request"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStageRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::requestStage"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI24RequestStatsPerIterationEE", "tensorrt_llm::executor::Serialization::serialize::requestStatsVec"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK8ResponseRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::response"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI8ResponseEE", "tensorrt_llm::executor::Serialization::serialize::responses"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6ResultRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::result"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15SchedulerConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::schedulerConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25SpeculativeDecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::specDecConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK24RequestStatsPerIteration", "tensorrt_llm::executor::Serialization::serialize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK24RequestStatsPerIterationRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10CacheStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache11SocketStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache9CommStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK19StaticBatchingStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::staticBatchingStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22DisServingRequestStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::stats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6TensorRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::tensor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN22KvCacheRetentionConfig25TokenRangeRetentionConfigERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::tokenRangeRetentionConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN18RequestPerfMetrics9TimePointERNSt7ostreamE", 
"tensorrt_llm::executor::Serialization::serialize::tp"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK10LoraConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11DebugConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11EagleConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11MropeConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12DecodingMode", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12KvCacheStats", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12OutputConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12RequestStage", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12RequestStats", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK13KvCacheConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14DecodingConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14ExecutorConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14IterationStats", 
"tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14ParallelConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14SamplingConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK15PeftCacheConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK15SchedulerConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK16AdditionalOutput", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18ContextPhaseParams", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18DynamicBatchConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18OrchestratorConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18PromptTuningConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18RequestPerfMetrics", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK19StaticBatchingStats", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20DataTransceiverState", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20GuidedDecodingConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20GuidedDecodingParams", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK21AdditionalModelOutput", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK21InflightBatchingStats", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22CacheTransceiverConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22DisServingRequestStats", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22KvCacheRetentionConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK23LookaheadDecodingConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK24RequestStatsPerIteration", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK25ExternalDraftTokensConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK25SpeculativeDecodingConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK29ExtendedRuntimePerfKnobConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK33SpeculativeDecodingFastLogitsInfo", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK6Result", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK6Tensor", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK7Request", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK8Response", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN18RequestPerfMetrics9TimePointE", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN22KvCacheRetentionConfig25TokenRangeRetentionConfigE", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache10CacheStateE", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache11SocketStateE", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache9CommStateE", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK21AdditionalModelOutput", "tensorrt_llm::executor::Serialization::serializedSize::additionalModelOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK16AdditionalOutput", "tensorrt_llm::executor::Serialization::serializedSize::additionalOutput"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22CacheTransceiverConfig", "tensorrt_llm::executor::Serialization::serializedSize::cacheTransceiverConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK10LoraConfig", "tensorrt_llm::executor::Serialization::serializedSize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11MropeConfig", "tensorrt_llm::executor::Serialization::serializedSize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12OutputConfig", "tensorrt_llm::executor::Serialization::serializedSize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14SamplingConfig", "tensorrt_llm::executor::Serialization::serializedSize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18PromptTuningConfig", "tensorrt_llm::executor::Serialization::serializedSize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK25ExternalDraftTokensConfig", "tensorrt_llm::executor::Serialization::serializedSize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18ContextPhaseParams", "tensorrt_llm::executor::Serialization::serializedSize::contextPhaseParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20DataTransceiverState", "tensorrt_llm::executor::Serialization::serializedSize::dataTransceiverState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11DebugConfig", "tensorrt_llm::executor::Serialization::serializedSize::debugConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14DecodingConfig", "tensorrt_llm::executor::Serialization::serializedSize::decodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12DecodingMode", "tensorrt_llm::executor::Serialization::serializedSize::decodingMode"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22DisServingRequestStats", "tensorrt_llm::executor::Serialization::serializedSize::disServingRequestStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18DynamicBatchConfig", "tensorrt_llm::executor::Serialization::serializedSize::dynamicBatchConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11EagleConfig", "tensorrt_llm::executor::Serialization::serializedSize::eagleConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14ExecutorConfig", "tensorrt_llm::executor::Serialization::serializedSize::executorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK29ExtendedRuntimePerfKnobConfig", "tensorrt_llm::executor::Serialization::serializedSize::extendedRuntimePerfKnobConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20GuidedDecodingConfig", "tensorrt_llm::executor::Serialization::serializedSize::guidedDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20GuidedDecodingParams", "tensorrt_llm::executor::Serialization::serializedSize::guidedDecodingParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK21InflightBatchingStats", "tensorrt_llm::executor::Serialization::serializedSize::inflightBatchingStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK33SpeculativeDecodingFastLogitsInfo", "tensorrt_llm::executor::Serialization::serializedSize::info"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14IterationStats", "tensorrt_llm::executor::Serialization::serializedSize::iterStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK13KvCacheConfig", "tensorrt_llm::executor::Serialization::serializedSize::kvCacheConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22KvCacheRetentionConfig", "tensorrt_llm::executor::Serialization::serializedSize::kvCacheRetentionConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12KvCacheStats", "tensorrt_llm::executor::Serialization::serializedSize::kvCacheStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK23LookaheadDecodingConfig", "tensorrt_llm::executor::Serialization::serializedSize::lookaheadDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18RequestPerfMetrics", "tensorrt_llm::executor::Serialization::serializedSize::metrics"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18OrchestratorConfig", "tensorrt_llm::executor::Serialization::serializedSize::orchestratorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14ParallelConfig", "tensorrt_llm::executor::Serialization::serializedSize::parallelConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK15PeftCacheConfig", "tensorrt_llm::executor::Serialization::serializedSize::peftCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK7Request", "tensorrt_llm::executor::Serialization::serializedSize::request"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12RequestStage", "tensorrt_llm::executor::Serialization::serializedSize::requestStage"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK8Response", "tensorrt_llm::executor::Serialization::serializedSize::response"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK6Result", "tensorrt_llm::executor::Serialization::serializedSize::result"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK15SchedulerConfig", "tensorrt_llm::executor::Serialization::serializedSize::schedulerConfig"], [0, 
4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK25SpeculativeDecodingConfig", "tensorrt_llm::executor::Serialization::serializedSize::specDecConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12RequestStats", "tensorrt_llm::executor::Serialization::serializedSize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK24RequestStatsPerIteration", "tensorrt_llm::executor::Serialization::serializedSize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache10CacheStateE", "tensorrt_llm::executor::Serialization::serializedSize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache11SocketStateE", "tensorrt_llm::executor::Serialization::serializedSize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache9CommStateE", "tensorrt_llm::executor::Serialization::serializedSize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK19StaticBatchingStats", "tensorrt_llm::executor::Serialization::serializedSize::staticBatchingStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK6Tensor", "tensorrt_llm::executor::Serialization::serializedSize::tensor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN22KvCacheRetentionConfig25TokenRangeRetentionConfigE", "tensorrt_llm::executor::Serialization::serializedSize::tokenRangeRetentionConfig"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor5ShapeE", "tensorrt_llm::executor::Shape"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor5Shape4BaseE", "tensorrt_llm::executor::Shape::Base"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor5Shape9DimType64E", "tensorrt_llm::executor::Shape::DimType64"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor5Shape5ShapeENSt16initializer_listI9DimType64EE", "tensorrt_llm::executor::Shape::Shape"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor5Shape5ShapeEPK9DimType64N4Base9size_typeE", "tensorrt_llm::executor::Shape::Shape"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor5Shape5ShapeEv", "tensorrt_llm::executor::Shape::Shape"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor5Shape5ShapeEPK9DimType64N4Base9size_typeE", "tensorrt_llm::executor::Shape::Shape::data"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor5Shape5ShapeENSt16initializer_listI9DimType64EE", "tensorrt_llm::executor::Shape::Shape::dims"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor5Shape5ShapeEPK9DimType64N4Base9size_typeE", "tensorrt_llm::executor::Shape::Shape::size"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor10SizeType32E", "tensorrt_llm::executor::SizeType32"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor25SpeculativeDecodingConfigE", "tensorrt_llm::executor::SpeculativeDecodingConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor25SpeculativeDecodingConfig25SpeculativeDecodingConfigEb", "tensorrt_llm::executor::SpeculativeDecodingConfig::SpeculativeDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25SpeculativeDecodingConfig25SpeculativeDecodingConfigEb", "tensorrt_llm::executor::SpeculativeDecodingConfig::SpeculativeDecodingConfig::fastLogits"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor25SpeculativeDecodingConfig10fastLogitsE", "tensorrt_llm::executor::SpeculativeDecodingConfig::fastLogits"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor25SpeculativeDecodingConfigeqERK25SpeculativeDecodingConfig", "tensorrt_llm::executor::SpeculativeDecodingConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor25SpeculativeDecodingConfigeqERK25SpeculativeDecodingConfig", "tensorrt_llm::executor::SpeculativeDecodingConfig::operator==::other"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor33SpeculativeDecodingFastLogitsInfoE", "tensorrt_llm::executor::SpeculativeDecodingFastLogitsInfo"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor33SpeculativeDecodingFastLogitsInfo18draftParticipantIdE", 
"tensorrt_llm::executor::SpeculativeDecodingFastLogitsInfo::draftParticipantId"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor33SpeculativeDecodingFastLogitsInfo14draftRequestIdE", "tensorrt_llm::executor::SpeculativeDecodingFastLogitsInfo::draftRequestId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor33SpeculativeDecodingFastLogitsInfo8toTensorEv", "tensorrt_llm::executor::SpeculativeDecodingFastLogitsInfo::toTensor"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStatsE", "tensorrt_llm::executor::StaticBatchingStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStats13emptyGenSlotsE", "tensorrt_llm::executor::StaticBatchingStats::emptyGenSlots"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStats18numContextRequestsE", "tensorrt_llm::executor::StaticBatchingStats::numContextRequests"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStats12numCtxTokensE", "tensorrt_llm::executor::StaticBatchingStats::numCtxTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStats12numGenTokensE", "tensorrt_llm::executor::StaticBatchingStats::numGenTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStats20numScheduledRequestsE", "tensorrt_llm::executor::StaticBatchingStats::numScheduledRequests"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor9StreamPtrE", "tensorrt_llm::executor::StreamPtr"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor6TensorE", "tensorrt_llm::executor::Tensor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::CudaStreamPtr"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor6Tensor4ImplE", "tensorrt_llm::executor::Tensor::Impl"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorENSt10shared_ptrIN7runtime7ITensorEEE", "tensorrt_llm::executor::Tensor::Tensor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorERK6Tensor", "tensorrt_llm::executor::Tensor::Tensor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorERR6Tensor", 
"tensorrt_llm::executor::Tensor::Tensor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorEv", "tensorrt_llm::executor::Tensor::Tensor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorERK6Tensor", "tensorrt_llm::executor::Tensor::Tensor::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorERR6Tensor", "tensorrt_llm::executor::Tensor::Tensor::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorENSt10shared_ptrIN7runtime7ITensorEEE", "tensorrt_llm::executor::Tensor::Tensor::tensor"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor6copyToENSt10shared_ptrI4ImplEE13CudaStreamPtr", "tensorrt_llm::executor::Tensor::copyTo"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor6copyToENSt10shared_ptrI4ImplEE13CudaStreamPtr", "tensorrt_llm::executor::Tensor::copyTo::stream"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor6copyToENSt10shared_ptrI4ImplEE13CudaStreamPtr", "tensorrt_llm::executor::Tensor::copyTo::tensor"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor9copyToCpuEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToCpu"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor9copyToCpuEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToCpu::stream"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor9copyToGpuEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToGpu"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor9copyToGpuEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToGpu::stream"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor13copyToManagedEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToManaged"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor13copyToManagedEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToManaged::stream"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor12copyToPinnedEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToPinned"], [0, 4, 1, 
"_CPPv4NK12tensorrt_llm8executor6Tensor12copyToPinnedEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToPinned::stream"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor18copyToPooledPinnedEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToPooledPinned"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor18copyToPooledPinnedEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToPooledPinned::stream"], [0, 3, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor3cpuE6Tensor5Shape", "tensorrt_llm::executor::Tensor::cpu"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor3cpuE8DataType5Shape", "tensorrt_llm::executor::Tensor::cpu"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor3cpuE6Tensor5Shape", "tensorrt_llm::executor::Tensor::cpu::T"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor3cpuE8DataType5Shape", "tensorrt_llm::executor::Tensor::cpu::dataType"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor3cpuE6Tensor5Shape", "tensorrt_llm::executor::Tensor::cpu::shape"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor3cpuE8DataType5Shape", "tensorrt_llm::executor::Tensor::cpu::shape"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6detail9ofITensorENSt10shared_ptrIN7runtime7ITensorEEE", "tensorrt_llm::executor::Tensor::detail::ofITensor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6detail9ofITensorENSt10shared_ptrIN7runtime7ITensorEEE", "tensorrt_llm::executor::Tensor::detail::ofITensor::tensor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6detail9toITensorERK6Tensor", "tensorrt_llm::executor::Tensor::detail::toITensor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6detail9toITensorERK6Tensor", "tensorrt_llm::executor::Tensor::detail::toITensor::tensor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7getDataEv", "tensorrt_llm::executor::Tensor::getData"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor7getDataEv", "tensorrt_llm::executor::Tensor::getData"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor6Tensor11getDataTypeEv", "tensorrt_llm::executor::Tensor::getDataType"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor13getMemoryTypeEv", "tensorrt_llm::executor::Tensor::getMemoryType"], [0, 3, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor14getRuntimeTypeE8DataTypev", "tensorrt_llm::executor::Tensor::getRuntimeType"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor14getRuntimeTypeE8DataTypev", "tensorrt_llm::executor::Tensor::getRuntimeType::T"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor8getShapeEv", "tensorrt_llm::executor::Tensor::getShape"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor7getSizeEv", "tensorrt_llm::executor::Tensor::getSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor14getSizeInBytesEv", "tensorrt_llm::executor::Tensor::getSizeInBytes"], [0, 3, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor3gpuE6Tensor13CudaStreamPtr5Shape", "tensorrt_llm::executor::Tensor::gpu"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor3gpuE8DataType13CudaStreamPtr5Shape", "tensorrt_llm::executor::Tensor::gpu"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor3gpuE6Tensor13CudaStreamPtr5Shape", "tensorrt_llm::executor::Tensor::gpu::T"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor3gpuE8DataType13CudaStreamPtr5Shape", "tensorrt_llm::executor::Tensor::gpu::dataType"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor3gpuE6Tensor13CudaStreamPtr5Shape", "tensorrt_llm::executor::Tensor::gpu::shape"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor3gpuE8DataType13CudaStreamPtr5Shape", "tensorrt_llm::executor::Tensor::gpu::shape"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor3gpuE6Tensor13CudaStreamPtr5Shape", "tensorrt_llm::executor::Tensor::gpu::stream"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor3gpuE8DataType13CudaStreamPtr5Shape", "tensorrt_llm::executor::Tensor::gpu::stream"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7mTensorE", "tensorrt_llm::executor::Tensor::mTensor"], [0, 3, 1, 
"_CPPv4I0EN12tensorrt_llm8executor6Tensor7managedE6Tensor5Shape", "tensorrt_llm::executor::Tensor::managed"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7managedE8DataType5Shape", "tensorrt_llm::executor::Tensor::managed"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor7managedE6Tensor5Shape", "tensorrt_llm::executor::Tensor::managed::T"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7managedE8DataType5Shape", "tensorrt_llm::executor::Tensor::managed::dataType"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor7managedE6Tensor5Shape", "tensorrt_llm::executor::Tensor::managed::shape"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7managedE8DataType5Shape", "tensorrt_llm::executor::Tensor::managed::shape"], [0, 3, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor2ofE6TensorP1T5Shape", "tensorrt_llm::executor::Tensor::of"], [0, 3, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor2ofE6TensorR1T", "tensorrt_llm::executor::Tensor::of"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor2ofE8DataTypePv5Shape", "tensorrt_llm::executor::Tensor::of"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor2ofE6TensorP1T5Shape", "tensorrt_llm::executor::Tensor::of::T"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor2ofE6TensorR1T", "tensorrt_llm::executor::Tensor::of::T"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor2ofE6TensorP1T5Shape", "tensorrt_llm::executor::Tensor::of::data"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor2ofE6TensorR1T", "tensorrt_llm::executor::Tensor::of::data"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor2ofE8DataTypePv5Shape", "tensorrt_llm::executor::Tensor::of::data"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor2ofE8DataTypePv5Shape", "tensorrt_llm::executor::Tensor::of::dataType"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor2ofE6TensorP1T5Shape", "tensorrt_llm::executor::Tensor::of::shape"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor2ofE8DataTypePv5Shape", "tensorrt_llm::executor::Tensor::of::shape"], 
[0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6TensorcvbEv", "tensorrt_llm::executor::Tensor::operator bool"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6TensorneERK6Tensor", "tensorrt_llm::executor::Tensor::operator!="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor6TensorneERK6Tensor", "tensorrt_llm::executor::Tensor::operator!=::rhs"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6TensoraSERK6Tensor", "tensorrt_llm::executor::Tensor::operator="], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6TensoraSERR6Tensor", "tensorrt_llm::executor::Tensor::operator="], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6TensoraSERK6Tensor", "tensorrt_llm::executor::Tensor::operator=::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6TensoraSERR6Tensor", "tensorrt_llm::executor::Tensor::operator=::other"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6TensoreqERK6Tensor", "tensorrt_llm::executor::Tensor::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor6TensoreqERK6Tensor", "tensorrt_llm::executor::Tensor::operator==::rhs"], [0, 3, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor6pinnedE6Tensor5Shape", "tensorrt_llm::executor::Tensor::pinned"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6pinnedE8DataType5Shape", "tensorrt_llm::executor::Tensor::pinned"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor6pinnedE6Tensor5Shape", "tensorrt_llm::executor::Tensor::pinned::T"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6pinnedE8DataType5Shape", "tensorrt_llm::executor::Tensor::pinned::dataType"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor6pinnedE6Tensor5Shape", "tensorrt_llm::executor::Tensor::pinned::shape"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6pinnedE8DataType5Shape", "tensorrt_llm::executor::Tensor::pinned::shape"], [0, 3, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor12pooledPinnedE6Tensor5Shape", "tensorrt_llm::executor::Tensor::pooledPinned"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor12pooledPinnedE8DataType5Shape", 
"tensorrt_llm::executor::Tensor::pooledPinned"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor12pooledPinnedE6Tensor5Shape", "tensorrt_llm::executor::Tensor::pooledPinned::T"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor12pooledPinnedE8DataType5Shape", "tensorrt_llm::executor::Tensor::pooledPinned::dataType"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor12pooledPinnedE6Tensor5Shape", "tensorrt_llm::executor::Tensor::pooledPinned::shape"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor12pooledPinnedE8DataType5Shape", "tensorrt_llm::executor::Tensor::pooledPinned::shape"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7setFromERK6Tensor13CudaStreamPtr", "tensorrt_llm::executor::Tensor::setFrom"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7setFromERK6Tensor13CudaStreamPtr", "tensorrt_llm::executor::Tensor::setFrom::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7setFromERK6Tensor13CudaStreamPtr", "tensorrt_llm::executor::Tensor::setFrom::stream"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7setZeroE13CudaStreamPtr", "tensorrt_llm::executor::Tensor::setZero"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7setZeroE13CudaStreamPtr", "tensorrt_llm::executor::Tensor::setZero::stream"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6TensorD0Ev", "tensorrt_llm::executor::Tensor::~Tensor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor9TensorPtrE", "tensorrt_llm::executor::TensorPtr"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor11TokenIdTypeE", "tensorrt_llm::executor::TokenIdType"], [0, 2, 1, "_CPPv4I0_bEN12tensorrt_llm8executor10TypeTraitsE", "tensorrt_llm::executor::TypeTraits"], [0, 8, 1, "_CPPv4I0_bEN12tensorrt_llm8executor10TypeTraitsE", "tensorrt_llm::executor::TypeTraits::T"], [0, 2, 1, "_CPPv4I0EN12tensorrt_llm8executor10TypeTraitsIP1TEE", "tensorrt_llm::executor::TypeTraits&lt;T*&gt;"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor10TypeTraitsIP1TEE", "tensorrt_llm::executor::TypeTraits&lt;T*&gt;::T"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor10TypeTraitsIP1TE5valueE", "tensorrt_llm::executor::TypeTraits&lt;T*&gt;::value"], [0, 2, 1, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsIbEE", "tensorrt_llm::executor::TypeTraits&lt;bool&gt;"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10TypeTraitsIbE5valueE", "tensorrt_llm::executor::TypeTraits&lt;bool&gt;::value"], [0, 2, 1, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsIfEE", "tensorrt_llm::executor::TypeTraits&lt;float&gt;"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10TypeTraitsIfE5valueE", "tensorrt_llm::executor::TypeTraits&lt;float&gt;::value"], [0, 2, 1, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsI4halfEE", "tensorrt_llm::executor::TypeTraits&lt;half&gt;"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10TypeTraitsI4halfE5valueE", "tensorrt_llm::executor::TypeTraits&lt;half&gt;::value"], [0, 2, 1, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsINSt7int32_tEEE", "tensorrt_llm::executor::TypeTraits&lt;std::int32_t&gt;"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10TypeTraitsINSt7int32_tEE5valueE", "tensorrt_llm::executor::TypeTraits&lt;std::int32_t&gt;::value"], [0, 2, 1, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsINSt7int64_tEEE", "tensorrt_llm::executor::TypeTraits&lt;std::int64_t&gt;"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10TypeTraitsINSt7int64_tEE5valueE", "tensorrt_llm::executor::TypeTraits&lt;std::int64_t&gt;::value"], [0, 2, 1, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsINSt6int8_tEEE", "tensorrt_llm::executor::TypeTraits&lt;std::int8_t&gt;"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10TypeTraitsINSt6int8_tEE5valueE", "tensorrt_llm::executor::TypeTraits&lt;std::int8_t&gt;::value"], [0, 2, 1, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsINSt7uint8_tEEE", "tensorrt_llm::executor::TypeTraits&lt;std::uint8_t&gt;"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10TypeTraitsINSt7uint8_tEE5valueE", "tensorrt_llm::executor::TypeTraits&lt;std::uint8_t&gt;::value"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor11VecLogProbsE", 
"tensorrt_llm::executor::VecLogProbs"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor16VecTokenExtraIdsE", "tensorrt_llm::executor::VecTokenExtraIds"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor9VecTokensE", "tensorrt_llm::executor::VecTokens"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor6detailE", "tensorrt_llm::executor::detail"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor6detail9DimType64E", "tensorrt_llm::executor::detail::DimType64"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6detail9ofITensorENSt10shared_ptrIN7runtime7ITensorEEE", "tensorrt_llm::executor::detail::ofITensor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6detail9ofITensorENSt10shared_ptrIN7runtime7ITensorEEE", "tensorrt_llm::executor::detail::ofITensor::tensor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6detail9toITensorERK6Tensor", "tensorrt_llm::executor::detail::toITensor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6detail9toITensorERK6Tensor", "tensorrt_llm::executor::detail::toITensor::tensor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executorE", "tensorrt_llm::executor::disagg_executor"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestratorE", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator26DisaggExecutorOrchestratorERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorIN8executor14ExecutorConfigEEERKNSt6vectorIN8executor14ExecutorConfigEEEbb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::DisaggExecutorOrchestrator"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator26DisaggExecutorOrchestratorERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorIN8executor14ExecutorConfigEEERKNSt6vectorIN8executor14ExecutorConfigEEEbb", 
"tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::DisaggExecutorOrchestrator::ctxEnginePaths"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator26DisaggExecutorOrchestratorERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorIN8executor14ExecutorConfigEEERKNSt6vectorIN8executor14ExecutorConfigEEEbb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::DisaggExecutorOrchestrator::ctxExecutorConfigs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator26DisaggExecutorOrchestratorERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorIN8executor14ExecutorConfigEEERKNSt6vectorIN8executor14ExecutorConfigEEEbb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::DisaggExecutorOrchestrator::genEnginePaths"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator26DisaggExecutorOrchestratorERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorIN8executor14ExecutorConfigEEERKNSt6vectorIN8executor14ExecutorConfigEEEbb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::DisaggExecutorOrchestrator::genExecutorConfigs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator26DisaggExecutorOrchestratorERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorIN8executor14ExecutorConfigEEERKNSt6vectorIN8executor14ExecutorConfigEEEbb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::DisaggExecutorOrchestrator::hasContextAwaitThreads"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator26DisaggExecutorOrchestratorERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorIN8executor14ExecutorConfigEEERKNSt6vectorIN8executor14ExecutorConfigEEEbb", 
"tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::DisaggExecutorOrchestrator::hasGenAwaitThreads"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator21awaitContextResponsesERKNSt8optionalINSt6chrono12millisecondsEEENSt8optionalIiEE", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::awaitContextResponses"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator21awaitContextResponsesERKNSt8optionalINSt6chrono12millisecondsEEENSt8optionalIiEE", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::awaitContextResponses::contextIdx"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator21awaitContextResponsesERKNSt8optionalINSt6chrono12millisecondsEEENSt8optionalIiEE", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::awaitContextResponses::timeout"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator24awaitGenerationResponsesERKNSt8optionalINSt6chrono12millisecondsEEENSt8optionalIiEE", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::awaitGenerationResponses"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator24awaitGenerationResponsesERKNSt8optionalINSt6chrono12millisecondsEEENSt8optionalIiEE", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::awaitGenerationResponses::genIdx"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator24awaitGenerationResponsesERKNSt8optionalINSt6chrono12millisecondsEEENSt8optionalIiEE", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::awaitGenerationResponses::timeout"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator10canEnqueueEv", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::canEnqueue"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator14enqueueContextERKNSt6vectorIN5texec7RequestEEENSt8optionalIiEEb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::enqueueContext"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator14enqueueContextERKNSt6vectorIN5texec7RequestEEENSt8optionalIiEEb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::enqueueContext::batch"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator14enqueueContextERKNSt6vectorIN5texec7RequestEEENSt8optionalIiEEb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::enqueueContext::requests"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator14enqueueContextERKNSt6vectorIN5texec7RequestEEENSt8optionalIiEEb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::enqueueContext::selectContextId"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator17enqueueGenerationERKNSt6vectorIN5texec7RequestEEERKNSt6vectorI6IdTypeEENSt8optionalIiEEb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::enqueueGeneration"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator17enqueueGenerationERKNSt6vectorIN5texec7RequestEEERKNSt6vectorI6IdTypeEENSt8optionalIiEEb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::enqueueGeneration::batch"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator17enqueueGenerationERKNSt6vectorIN5texec7RequestEEERKNSt6vectorI6IdTypeEENSt8optionalIiEEb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::enqueueGeneration::globalRequestIds"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator17enqueueGenerationERKNSt6vectorIN5texec7RequestEEERKNSt6vectorI6IdTypeEENSt8optionalIiEEb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::enqueueGeneration::requests"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator17enqueueGenerationERKNSt6vectorIN5texec7RequestEEERKNSt6vectorI6IdTypeEENSt8optionalIiEEb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::enqueueGeneration::selectGenIdx"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator19getContextExecutorsEv", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::getContextExecutors"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator15getGenExecutorsEv", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::getGenExecutors"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator5mImplE", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::mImpl"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestratorD0Ev", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::~DisaggExecutorOrchestrator"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdE", "tensorrt_llm::executor::disagg_executor::ResponseWithId"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERK14ResponseWithId", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERKN12tensorrt_llm8executor8ResponseE6IdType", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERR14ResponseWithId", 
"tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERRN12tensorrt_llm8executor8ResponseE6IdType", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERKN12tensorrt_llm8executor8ResponseE6IdType", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId::gid"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERRN12tensorrt_llm8executor8ResponseE6IdType", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId::gid"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERK14ResponseWithId", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERR14ResponseWithId", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERKN12tensorrt_llm8executor8ResponseE6IdType", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId::response"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERRN12tensorrt_llm8executor8ResponseE6IdType", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId::response"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId3gidE", "tensorrt_llm::executor::disagg_executor::ResponseWithId::gid"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdaSERK14ResponseWithId", "tensorrt_llm::executor::disagg_executor::ResponseWithId::operator="], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdaSERR14ResponseWithId", 
"tensorrt_llm::executor::disagg_executor::ResponseWithId::operator="], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdaSERK14ResponseWithId", "tensorrt_llm::executor::disagg_executor::ResponseWithId::operator=::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdaSERR14ResponseWithId", "tensorrt_llm::executor::disagg_executor::ResponseWithId::operator=::other"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId8responseE", "tensorrt_llm::executor::disagg_executor::ResponseWithId::response"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdD0Ev", "tensorrt_llm::executor::disagg_executor::ResponseWithId::~ResponseWithId"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor8kv_cacheE", "tensorrt_llm::executor::kv_cache"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor8kv_cacheE", "tensorrt_llm::executor::kv_cache"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor8kv_cacheE", "tensorrt_llm::executor::kv_cache"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheStateE", "tensorrt_llm::executor::kv_cache::CacheState"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfigE", "tensorrt_llm::executor::kv_cache::CacheState::AttentionConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfig15AttentionConfigE13AttentionTypei", "tensorrt_llm::executor::kv_cache::CacheState::AttentionConfig::AttentionConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfig15AttentionConfigE13AttentionTypei", "tensorrt_llm::executor::kv_cache::CacheState::AttentionConfig::AttentionConfig::attentionType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfig15AttentionConfigE13AttentionTypei", "tensorrt_llm::executor::kv_cache::CacheState::AttentionConfig::AttentionConfig::kvFactor"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfig14mAttentionTypeE", 
"tensorrt_llm::executor::kv_cache::CacheState::AttentionConfig::mAttentionType"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfig9mKvFactorE", "tensorrt_llm::executor::kv_cache::CacheState::AttentionConfig::mKvFactor"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState13AttentionTypeE", "tensorrt_llm::executor::kv_cache::CacheState::AttentionType"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState13AttentionType8kDEFAULTE", "tensorrt_llm::executor::kv_cache::CacheState::AttentionType::kDEFAULT"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState13AttentionType4kMLAE", "tensorrt_llm::executor::kv_cache::CacheState::AttentionType::kMLA"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE11ModelConfigRKN7runtime11WorldConfigEN8nvinfer18DataTypeE13AttentionTypei", "tensorrt_llm::executor::kv_cache::CacheState::CacheState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::DPrank"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::DPrank"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::DPsize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::DPsize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::attentionType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE11ModelConfigRKN7runtime11WorldConfigEN8nvinfer18DataTypeE13AttentionTypei", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::attentionType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::attentionType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::dataType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE11ModelConfigRKN7runtime11WorldConfigEN8nvinfer18DataTypeE13AttentionTypei", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::dataType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", 
"tensorrt_llm::executor::kv_cache::CacheState::CacheState::dataType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::enableAttentionDP"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::enableAttentionDP"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::kvFactor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE11ModelConfigRKN7runtime11WorldConfigEN8nvinfer18DataTypeE13AttentionTypei", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::kvFactor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::kvFactor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE11ModelConfigRKN7runtime11WorldConfigEN8nvinfer18DataTypeE13AttentionTypei", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::modelConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::nbAttentionLayers"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::nbKvHeadPerLayer"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::nbKvHeads"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::pipelineParallelism"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::pipelineParallelism"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::sizePerHead"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::sizePerHead"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::tensorParallelism"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::tensorParallelism"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::tokensPerBlock"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::tokensPerBlock"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE11ModelConfigRKN7runtime11WorldConfigEN8nvinfer18DataTypeE13AttentionTypei", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::worldConfig"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState11ModelConfigE", "tensorrt_llm::executor::kv_cache::CacheState::ModelConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState11ModelConfig18mNbKvHeadsPerLayerE", "tensorrt_llm::executor::kv_cache::CacheState::ModelConfig::mNbKvHeadsPerLayer"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState11ModelConfig12mSizePerHeadE", "tensorrt_llm::executor::kv_cache::CacheState::ModelConfig::mSizePerHead"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState11ModelConfig15mTokensPerBlockE", "tensorrt_llm::executor::kv_cache::CacheState::ModelConfig::mTokensPerBlock"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState11ModelConfigeqERK11ModelConfig", "tensorrt_llm::executor::kv_cache::CacheState::ModelConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState11ModelConfigeqERK11ModelConfig", 
"tensorrt_llm::executor::kv_cache::CacheState::ModelConfig::operator==::other"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfigE", "tensorrt_llm::executor::kv_cache::CacheState::ParallelConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig7mDPrankE", "tensorrt_llm::executor::kv_cache::CacheState::ParallelConfig::mDPrank"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig7mDPsizeE", "tensorrt_llm::executor::kv_cache::CacheState::ParallelConfig::mDPsize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig18mEnableAttentionDPE", "tensorrt_llm::executor::kv_cache::CacheState::ParallelConfig::mEnableAttentionDP"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig20mPipelineParallelismE", "tensorrt_llm::executor::kv_cache::CacheState::ParallelConfig::mPipelineParallelism"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig18mTensorParallelismE", "tensorrt_llm::executor::kv_cache::CacheState::ParallelConfig::mTensorParallelism"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfigeqERK14ParallelConfig", "tensorrt_llm::executor::kv_cache::CacheState::ParallelConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfigeqERK14ParallelConfig", "tensorrt_llm::executor::kv_cache::CacheState::ParallelConfig::operator==::other"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState18getAttentionConfigEv", "tensorrt_llm::executor::kv_cache::CacheState::getAttentionConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState11getDataTypeEv", "tensorrt_llm::executor::kv_cache::CacheState::getDataType"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState14getModelConfigEv", "tensorrt_llm::executor::kv_cache::CacheState::getModelConfig"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState17getParallelConfigEv", "tensorrt_llm::executor::kv_cache::CacheState::getParallelConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState16mAttentionConfigE", "tensorrt_llm::executor::kv_cache::CacheState::mAttentionConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState9mDataTypeE", "tensorrt_llm::executor::kv_cache::CacheState::mDataType"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState12mModelConfigE", "tensorrt_llm::executor::kv_cache::CacheState::mModelConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15mParallelConfigE", "tensorrt_llm::executor::kv_cache::CacheState::mParallelConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheStateeqERKN8kv_cache10CacheStateE", "tensorrt_llm::executor::kv_cache::CacheState::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheStateeqERKN8kv_cache10CacheStateE", "tensorrt_llm::executor::kv_cache::CacheState::operator==::other"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState8toStringEv", "tensorrt_llm::executor::kv_cache::CacheState::toString"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommStateE", "tensorrt_llm::executor::kv_cache::CommState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10SizeType32EEi", "tensorrt_llm::executor::kv_cache::CommState::CommState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI11SocketStateEEi", "tensorrt_llm::executor::kv_cache::CommState::CommState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt8uint16_tENSt6stringE", "tensorrt_llm::executor::kv_cache::CommState::CommState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateEv", "tensorrt_llm::executor::kv_cache::CommState::CommState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt8uint16_tENSt6stringE", 
"tensorrt_llm::executor::kv_cache::CommState::CommState::ip"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt8uint16_tENSt6stringE", "tensorrt_llm::executor::kv_cache::CommState::CommState::port"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10SizeType32EEi", "tensorrt_llm::executor::kv_cache::CommState::CommState::ranks"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10SizeType32EEi", "tensorrt_llm::executor::kv_cache::CommState::CommState::selfIdx"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI11SocketStateEEi", "tensorrt_llm::executor::kv_cache::CommState::CommState::selfIdx"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI11SocketStateEEi", "tensorrt_llm::executor::kv_cache::CommState::CommState::socketState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState11getMpiStateEv", "tensorrt_llm::executor::kv_cache::CommState::getMpiState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState10getSelfIdxEv", "tensorrt_llm::executor::kv_cache::CommState::getSelfIdx"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState14getSocketStateEv", "tensorrt_llm::executor::kv_cache::CommState::getSocketState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState10isMpiStateEv", "tensorrt_llm::executor::kv_cache::CommState::isMpiState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState13isSocketStateEv", "tensorrt_llm::executor::kv_cache::CommState::isSocketState"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState8mSelfIdxE", "tensorrt_llm::executor::kv_cache::CommState::mSelfIdx"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState6mStateE", "tensorrt_llm::executor::kv_cache::CommState::mState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommStateeqERK9CommState", 
"tensorrt_llm::executor::kv_cache::CommState::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommStateeqERK9CommState", "tensorrt_llm::executor::kv_cache::CommState::operator==::other"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState8toStringEv", "tensorrt_llm::executor::kv_cache::CommState::toString"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10ConnectionE", "tensorrt_llm::executor::kv_cache::Connection"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection12isThreadSafeEv", "tensorrt_llm::executor::kv_cache::Connection::isThreadSafe"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4recvERK11DataContextPv6size_t", "tensorrt_llm::executor::kv_cache::Connection::recv"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4recvERK11DataContextPv6size_t", "tensorrt_llm::executor::kv_cache::Connection::recv::ctx"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4recvERK11DataContextPv6size_t", "tensorrt_llm::executor::kv_cache::Connection::recv::data"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4recvERK11DataContextPv6size_t", "tensorrt_llm::executor::kv_cache::Connection::recv::size"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4sendERK11DataContextPKv6size_t", "tensorrt_llm::executor::kv_cache::Connection::send"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4sendERK11DataContextPKv6size_t", "tensorrt_llm::executor::kv_cache::Connection::send::ctx"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4sendERK11DataContextPKv6size_t", "tensorrt_llm::executor::kv_cache::Connection::send::data"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4sendERK11DataContextPKv6size_t", "tensorrt_llm::executor::kv_cache::Connection::send::size"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10ConnectionD0Ev", "tensorrt_llm::executor::kv_cache::Connection::~Connection"], [0, 2, 1, 
"_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManagerE", "tensorrt_llm::executor::kv_cache::ConnectionManager"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache17ConnectionManager12getCommStateEv", "tensorrt_llm::executor::kv_cache::ConnectionManager::getCommState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManager14getConnectionsERK9CommState", "tensorrt_llm::executor::kv_cache::ConnectionManager::getConnections"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManager14getConnectionsERK9CommState", "tensorrt_llm::executor::kv_cache::ConnectionManager::getConnections::state"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManager11recvConnectERK11DataContextPv6size_t", "tensorrt_llm::executor::kv_cache::ConnectionManager::recvConnect"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManager11recvConnectERK11DataContextPv6size_t", "tensorrt_llm::executor::kv_cache::ConnectionManager::recvConnect::ctx"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManager11recvConnectERK11DataContextPv6size_t", "tensorrt_llm::executor::kv_cache::ConnectionManager::recvConnect::data"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManager11recvConnectERK11DataContextPv6size_t", "tensorrt_llm::executor::kv_cache::ConnectionManager::recvConnect::size"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManagerD0Ev", "tensorrt_llm::executor::kv_cache::ConnectionManager::~ConnectionManager"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11DataContextE", "tensorrt_llm::executor::kv_cache::DataContext"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11DataContext11DataContextEi", "tensorrt_llm::executor::kv_cache::DataContext::DataContext"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11DataContext11DataContextEi", "tensorrt_llm::executor::kv_cache::DataContext::DataContext::tag"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor8kv_cache11DataContext6getTagEv", "tensorrt_llm::executor::kv_cache::DataContext::getTag"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11DataContext4mTagE", "tensorrt_llm::executor::kv_cache::DataContext::mTag"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache8MpiStateE", "tensorrt_llm::executor::kv_cache::MpiState"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache8MpiState6mRanksE", "tensorrt_llm::executor::kv_cache::MpiState::mRanks"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache8MpiStateeqERK8MpiState", "tensorrt_llm::executor::kv_cache::MpiState::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache8MpiStateeqERK8MpiState", "tensorrt_llm::executor::kv_cache::MpiState::operator==::other"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache8MpiState8toStringEv", "tensorrt_llm::executor::kv_cache::MpiState::toString"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11SocketStateE", "tensorrt_llm::executor::kv_cache::SocketState"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11SocketState3mIpE", "tensorrt_llm::executor::kv_cache::SocketState::mIp"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11SocketState5mPortE", "tensorrt_llm::executor::kv_cache::SocketState::mPort"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache11SocketStateeqERK11SocketState", "tensorrt_llm::executor::kv_cache::SocketState::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache11SocketStateeqERK11SocketState", "tensorrt_llm::executor::kv_cache::SocketState::operator==::other"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache11SocketState8toStringEv", "tensorrt_llm::executor::kv_cache::SocketState::toString"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executorlsERNSt7ostreamE21ContextChunkingPolicy", "tensorrt_llm::executor::operator&lt;&lt;"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executorlsERNSt7ostreamE23CapacitySchedulerPolicy", "tensorrt_llm::executor::operator&lt;&lt;"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executorlsERNSt7ostreamE21ContextChunkingPolicy", "tensorrt_llm::executor::operator&lt;&lt;::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executorlsERNSt7ostreamE23CapacitySchedulerPolicy", "tensorrt_llm::executor::operator&lt;&lt;::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executorlsERNSt7ostreamE21ContextChunkingPolicy", "tensorrt_llm::executor::operator&lt;&lt;::policy"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executorlsERNSt7ostreamE23CapacitySchedulerPolicy", "tensorrt_llm::executor::operator&lt;&lt;::policy"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7versionEv", "tensorrt_llm::executor::version"], [1, 1, 1, "_CPPv4N12tensorrt_llm6layersE", "tensorrt_llm::layers"], [0, 1, 1, "_CPPv4N12tensorrt_llm3mpiE", "tensorrt_llm::mpi"], [0, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [0, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", 
"tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffersE", "tensorrt_llm::runtime::AllReduceBuffers"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb", "tensorrt_llm::runtime::AllReduceBuffers::AllReduceBuffers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb", "tensorrt_llm::runtime::AllReduceBuffers::AllReduceBuffers::fakeBuffers"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb", "tensorrt_llm::runtime::AllReduceBuffers::AllReduceBuffers::hiddenSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb", "tensorrt_llm::runtime::AllReduceBuffers::AllReduceBuffers::manager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb", "tensorrt_llm::runtime::AllReduceBuffers::AllReduceBuffers::maxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb", "tensorrt_llm::runtime::AllReduceBuffers::AllReduceBuffers::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb", "tensorrt_llm::runtime::AllReduceBuffers::AllReduceBuffers::maxSequenceLength"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb", "tensorrt_llm::runtime::AllReduceBuffers::AllReduceBuffers::worldConfig"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9TensorPtrE", "tensorrt_llm::runtime::AllReduceBuffers::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers18mAllReduceCommPtrsE", "tensorrt_llm::runtime::AllReduceBuffers::mAllReduceCommPtrs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers17mIpcMemoryHandlesE", "tensorrt_llm::runtime::AllReduceBuffers::mIpcMemoryHandles"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime14BufferDataTypeE", "tensorrt_llm::runtime::BufferDataType"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime14BufferDataType14BufferDataTypeEN8nvinfer18DataTypeEbb", "tensorrt_llm::runtime::BufferDataType::BufferDataType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType14BufferDataTypeEN8nvinfer18DataTypeEbb", "tensorrt_llm::runtime::BufferDataType::BufferDataType::_unsigned"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType14BufferDataTypeEN8nvinfer18DataTypeEbb", "tensorrt_llm::runtime::BufferDataType::BufferDataType::dataType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType14BufferDataTypeEN8nvinfer18DataTypeEbb", "tensorrt_llm::runtime::BufferDataType::BufferDataType::pointer"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType11getDataTypeEv", "tensorrt_llm::runtime::BufferDataType::getDataType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType7getSizeEv", "tensorrt_llm::runtime::BufferDataType::getSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType13getSizeInBitsEv", "tensorrt_llm::runtime::BufferDataType::getSizeInBits"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType9isPointerEv", "tensorrt_llm::runtime::BufferDataType::isPointer"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType10isUnsignedEv", "tensorrt_llm::runtime::BufferDataType::isUnsigned"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType15kTrtPointerTypeE", "tensorrt_llm::runtime::BufferDataType::kTrtPointerType"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType9mDataTypeE", "tensorrt_llm::runtime::BufferDataType::mDataType"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType8mPointerE", "tensorrt_llm::runtime::BufferDataType::mPointer"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType9mUnsignedE", "tensorrt_llm::runtime::BufferDataType::mUnsigned"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataTypecvN8nvinfer18DataTypeEEv", "tensorrt_llm::runtime::BufferDataType::operator nvinfer1::DataType"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManagerE", 
"tensorrt_llm::runtime::BufferManager"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager13BufferManagerE13CudaStreamPtrb", "tensorrt_llm::runtime::BufferManager::BufferManager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager13BufferManagerE13CudaStreamPtrb", "tensorrt_llm::runtime::BufferManager::BufferManager::stream"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager13BufferManagerE13CudaStreamPtrb", "tensorrt_llm::runtime::BufferManager::BufferManager::trimPool"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager14CudaMemPoolPtrE", "tensorrt_llm::runtime::BufferManager::CudaMemPoolPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager13CudaStreamPtrE", "tensorrt_llm::runtime::BufferManager::CudaStreamPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10IBufferPtrE", "tensorrt_llm::runtime::BufferManager::IBufferPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10ITensorPtrE", "tensorrt_llm::runtime::BufferManager::ITensorPtr"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::allocate"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeNSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::allocate"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::allocate::dims"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::allocate::memoryType"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeNSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::allocate::memoryType"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeNSt6size_tEN8nvinfer18DataTypeE", 
"tensorrt_llm::runtime::BufferManager::allocate::size"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::allocate::type"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeNSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::allocate::type"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyEPKvR7IBuffer", "tensorrt_llm::runtime::BufferManager::copy"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyEPKvR7IBuffer10MemoryType", "tensorrt_llm::runtime::BufferManager::copy"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferPv", "tensorrt_llm::runtime::BufferManager::copy"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferPv10MemoryType", "tensorrt_llm::runtime::BufferManager::copy"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferR7IBuffer", "tensorrt_llm::runtime::BufferManager::copy"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyEPKvR7IBuffer", "tensorrt_llm::runtime::BufferManager::copy::dst"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyEPKvR7IBuffer10MemoryType", "tensorrt_llm::runtime::BufferManager::copy::dst"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferPv", "tensorrt_llm::runtime::BufferManager::copy::dst"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferPv10MemoryType", "tensorrt_llm::runtime::BufferManager::copy::dst"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferR7IBuffer", "tensorrt_llm::runtime::BufferManager::copy::dst"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferPv10MemoryType", "tensorrt_llm::runtime::BufferManager::copy::dstType"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyEPKvR7IBuffer", 
"tensorrt_llm::runtime::BufferManager::copy::src"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyEPKvR7IBuffer10MemoryType", "tensorrt_llm::runtime::BufferManager::copy::src"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferPv", "tensorrt_llm::runtime::BufferManager::copy::src"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferPv10MemoryType", "tensorrt_llm::runtime::BufferManager::copy::src"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferR7IBuffer", "tensorrt_llm::runtime::BufferManager::copy::src"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyEPKvR7IBuffer10MemoryType", "tensorrt_llm::runtime::BufferManager::copy::srcType"], [1, 3, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10IBufferPtrRKNSt6vectorI1TEE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom"], [1, 3, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrP1TN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom"], [1, 3, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrRKNSt6vectorI1TEEN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8copyFromERK7IBuffer10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8copyFromERK7ITensor10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom"], [1, 8, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10IBufferPtrRKNSt6vectorI1TEE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::T"], [1, 8, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrP1TN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::T"], [1, 8, 1, 
"_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrRKNSt6vectorI1TEEN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::T"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrP1TN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::dims"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrRKNSt6vectorI1TEEN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::dims"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10IBufferPtrRKNSt6vectorI1TEE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::memoryType"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrP1TN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::memoryType"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrRKNSt6vectorI1TEEN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::memoryType"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8copyFromERK7IBuffer10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::memoryType"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8copyFromERK7ITensor10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::memoryType"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10IBufferPtrRKNSt6vectorI1TEE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::src"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrP1TN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::src"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrRKNSt6vectorI1TEEN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::src"], [1, 4, 1, 
"_CPPv4NK12tensorrt_llm7runtime13BufferManager8copyFromERK7IBuffer10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::src"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8copyFromERK7ITensor10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::src"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager3cpuEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::cpu"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager3cpuENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::cpu"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager3cpuEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::cpu::dims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager3cpuENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::cpu::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager3cpuEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::cpu::type"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager3cpuENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::cpu::type"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager11emptyBufferE10MemoryTypeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::emptyBuffer"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager11emptyBufferE10MemoryTypeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::emptyBuffer::memoryType"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager11emptyBufferE10MemoryTypeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::emptyBuffer::type"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager11emptyTensorE10MemoryTypeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::emptyTensor"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager11emptyTensorE10MemoryTypeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::emptyTensor::memoryType"], [1, 4, 1, 
"_CPPv4NK12tensorrt_llm7runtime13BufferManager11emptyTensorE10MemoryTypeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::emptyTensor::type"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager9getStreamEv", "tensorrt_llm::runtime::BufferManager::getStream"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager3gpuEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpu"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager3gpuENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpu"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager3gpuEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpu::dims"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager3gpuENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpu::size"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager3gpuEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpu::type"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager3gpuENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpu::type"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7gpuSyncEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpuSync"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7gpuSyncENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpuSync"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7gpuSyncEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpuSync::dims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7gpuSyncENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpuSync::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7gpuSyncEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpuSync::type"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime13BufferManager7gpuSyncENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpuSync::type"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7ipcNvlsENSt3setIiEEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::ipcNvls"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7ipcNvlsENSt3setIiEEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::ipcNvls::dims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7ipcNvlsENSt3setIiEEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::ipcNvls::ranks"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7ipcNvlsENSt3setIiEEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::ipcNvls::type"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10kBYTE_TYPEE", "tensorrt_llm::runtime::BufferManager::kBYTE_TYPE"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager5mPoolE", "tensorrt_llm::runtime::BufferManager::mPool"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7mStreamE", "tensorrt_llm::runtime::BufferManager::mStream"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager9mTrimPoolE", "tensorrt_llm::runtime::BufferManager::mTrimPool"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7managedEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::managed"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7managedENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::managed"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7managedEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::managed::dims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7managedENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::managed::size"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime13BufferManager7managedEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::managed::type"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7managedENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::managed::type"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager14memoryPoolFreeEv", "tensorrt_llm::runtime::BufferManager::memoryPoolFree"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager18memoryPoolReservedEv", "tensorrt_llm::runtime::BufferManager::memoryPoolReserved"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager16memoryPoolTrimToENSt6size_tE", "tensorrt_llm::runtime::BufferManager::memoryPoolTrimTo"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager16memoryPoolTrimToENSt6size_tE", "tensorrt_llm::runtime::BufferManager::memoryPoolTrimTo::size"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager14memoryPoolUsedEv", "tensorrt_llm::runtime::BufferManager::memoryPoolUsed"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager6pinnedEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinned"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager6pinnedENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinned"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager6pinnedEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinned::dims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager6pinnedENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinned::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager6pinnedEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinned::type"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager6pinnedENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinned::type"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime13BufferManager10pinnedPoolEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinnedPool"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10pinnedPoolENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinnedPool"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10pinnedPoolEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinnedPool::dims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10pinnedPoolENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinnedPool::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10pinnedPoolEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinnedPool::type"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10pinnedPoolENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinnedPool::type"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager6setMemER7IBuffer7int32_t", "tensorrt_llm::runtime::BufferManager::setMem"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager6setMemER7IBuffer7int32_t", "tensorrt_llm::runtime::BufferManager::setMem::buffer"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager6setMemER7IBuffer7int32_t", "tensorrt_llm::runtime::BufferManager::setMem::value"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager7setZeroER7IBuffer", "tensorrt_llm::runtime::BufferManager::setZero"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager7setZeroER7IBuffer", "tensorrt_llm::runtime::BufferManager::setZero::buffer"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManagerD0Ev", "tensorrt_llm::runtime::BufferManager::~BufferManager"], [1, 2, 1, "_CPPv4I0EN12tensorrt_llm7runtime11BufferRangeE", "tensorrt_llm::runtime::BufferRange"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime11BufferRange4BaseE", "tensorrt_llm::runtime::BufferRange::Base"], [1, 3, 1, 
"_CPPv4I0_NSt11enable_if_tINSt10is_const_vI1UEEbEEEN12tensorrt_llm7runtime11BufferRange11BufferRangeERK7IBuffer", "tensorrt_llm::runtime::BufferRange::BufferRange"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tIXntNSt10is_const_vI1UEEEbEEEN12tensorrt_llm7runtime11BufferRange11BufferRangeER7IBuffer", "tensorrt_llm::runtime::BufferRange::BufferRange"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11BufferRange11BufferRangeEP1T9size_type", "tensorrt_llm::runtime::BufferRange::BufferRange"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI1UEEbEEEN12tensorrt_llm7runtime11BufferRange11BufferRangeERK7IBuffer", "tensorrt_llm::runtime::BufferRange::BufferRange::U"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tIXntNSt10is_const_vI1UEEEbEEEN12tensorrt_llm7runtime11BufferRange11BufferRangeER7IBuffer", "tensorrt_llm::runtime::BufferRange::BufferRange::U"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI1UEEbEEEN12tensorrt_llm7runtime11BufferRange11BufferRangeERK7IBuffer", "tensorrt_llm::runtime::BufferRange::BufferRange::buffer"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tIXntNSt10is_const_vI1UEEEbEEEN12tensorrt_llm7runtime11BufferRange11BufferRangeER7IBuffer", "tensorrt_llm::runtime::BufferRange::BufferRange::buffer"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11BufferRange11BufferRangeEP1T9size_type", "tensorrt_llm::runtime::BufferRange::BufferRange::data"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11BufferRange11BufferRangeEP1T9size_type", "tensorrt_llm::runtime::BufferRange::BufferRange::size"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime11BufferRangeE", "tensorrt_llm::runtime::BufferRange::T"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEventE", "tensorrt_llm::runtime::CudaEvent"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent9CudaEventE7pointerb", "tensorrt_llm::runtime::CudaEvent::CudaEvent"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent9CudaEventEj", "tensorrt_llm::runtime::CudaEvent::CudaEvent"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent9CudaEventE7pointerb", 
"tensorrt_llm::runtime::CudaEvent::CudaEvent::event"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent9CudaEventEj", "tensorrt_llm::runtime::CudaEvent::CudaEvent::flags"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent9CudaEventE7pointerb", "tensorrt_llm::runtime::CudaEvent::CudaEvent::ownsEvent"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7DeleterE", "tensorrt_llm::runtime::CudaEvent::Deleter"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7Deleter7DeleterEb", "tensorrt_llm::runtime::CudaEvent::Deleter::Deleter"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7Deleter7DeleterEv", "tensorrt_llm::runtime::CudaEvent::Deleter::Deleter"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7Deleter7DeleterEb", "tensorrt_llm::runtime::CudaEvent::Deleter::Deleter::ownsEvent"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7Deleter10mOwnsEventE", "tensorrt_llm::runtime::CudaEvent::Deleter::mOwnsEvent"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9CudaEvent7DeleterclE7pointer", "tensorrt_llm::runtime::CudaEvent::Deleter::operator()"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9CudaEvent7DeleterclE7pointer", "tensorrt_llm::runtime::CudaEvent::Deleter::operator()::event"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent8EventPtrE", "tensorrt_llm::runtime::CudaEvent::EventPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent12element_typeE", "tensorrt_llm::runtime::CudaEvent::element_type"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9CudaEvent3getEv", "tensorrt_llm::runtime::CudaEvent::get"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent6mEventE", "tensorrt_llm::runtime::CudaEvent::mEvent"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7pointerE", "tensorrt_llm::runtime::CudaEvent::pointer"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9CudaEvent11synchronizeEv", "tensorrt_llm::runtime::CudaEvent::synchronize"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStreamE", "tensorrt_llm::runtime::CudaStream"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamE12cudaStream_t", "tensorrt_llm::runtime::CudaStream::CudaStream"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamE12cudaStream_tib", "tensorrt_llm::runtime::CudaStream::CudaStream"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamEji", "tensorrt_llm::runtime::CudaStream::CudaStream"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamE12cudaStream_tib", "tensorrt_llm::runtime::CudaStream::CudaStream::device"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamEji", "tensorrt_llm::runtime::CudaStream::CudaStream::flags"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamE12cudaStream_tib", "tensorrt_llm::runtime::CudaStream::CudaStream::ownsStream"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamEji", "tensorrt_llm::runtime::CudaStream::CudaStream::priority"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamE12cudaStream_t", "tensorrt_llm::runtime::CudaStream::CudaStream::stream"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamE12cudaStream_tib", "tensorrt_llm::runtime::CudaStream::CudaStream::stream"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7DeleterE", "tensorrt_llm::runtime::CudaStream::Deleter"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7Deleter7DeleterEb", "tensorrt_llm::runtime::CudaStream::Deleter::Deleter"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7Deleter7DeleterEv", "tensorrt_llm::runtime::CudaStream::Deleter::Deleter"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7Deleter7DeleterEb", "tensorrt_llm::runtime::CudaStream::Deleter::Deleter::ownsStream"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7Deleter11mOwnsStreamE", "tensorrt_llm::runtime::CudaStream::Deleter::mOwnsStream"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream7DeleterclE12cudaStream_t", "tensorrt_llm::runtime::CudaStream::Deleter::operator()"], [1, 4, 1, 
"_CPPv4NK12tensorrt_llm7runtime10CudaStream7DeleterclE12cudaStream_t", "tensorrt_llm::runtime::CudaStream::Deleter::operator()::stream"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream9StreamPtrE", "tensorrt_llm::runtime::CudaStream::StreamPtr"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream3getEv", "tensorrt_llm::runtime::CudaStream::get"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream9getDeviceEv", "tensorrt_llm::runtime::CudaStream::getDevice"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7mDeviceE", "tensorrt_llm::runtime::CudaStream::mDevice"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7mStreamE", "tensorrt_llm::runtime::CudaStream::mStream"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream6recordEN9CudaEvent7pointerE", "tensorrt_llm::runtime::CudaStream::record"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream6recordERK9CudaEvent", "tensorrt_llm::runtime::CudaStream::record"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream6recordEN9CudaEvent7pointerE", "tensorrt_llm::runtime::CudaStream::record::event"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream6recordERK9CudaEvent", "tensorrt_llm::runtime::CudaStream::record::event"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream11synchronizeEv", "tensorrt_llm::runtime::CudaStream::synchronize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream4waitEN9CudaEvent7pointerE", "tensorrt_llm::runtime::CudaStream::wait"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream4waitERK9CudaEvent", "tensorrt_llm::runtime::CudaStream::wait"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream4waitEN9CudaEvent7pointerE", "tensorrt_llm::runtime::CudaStream::wait::event"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream4waitERK9CudaEvent", "tensorrt_llm::runtime::CudaStream::wait::event"], [1, 2, 1, "_CPPv4I_N8nvinfer18DataTypeE_b_bEN12tensorrt_llm7runtime14DataTypeTraitsE", "tensorrt_llm::runtime::DataTypeTraits"], [1, 8, 1, 
"_CPPv4I_N8nvinfer18DataTypeE_b_bEN12tensorrt_llm7runtime14DataTypeTraitsE", "tensorrt_llm::runtime::DataTypeTraits::kDataType"], [1, 8, 1, "_CPPv4I_N8nvinfer18DataTypeE_b_bEN12tensorrt_llm7runtime14DataTypeTraitsE", "tensorrt_llm::runtime::DataTypeTraits::kIsPointer"], [1, 8, 1, "_CPPv4I_N8nvinfer18DataTypeE_b_bEN12tensorrt_llm7runtime14DataTypeTraitsE", "tensorrt_llm::runtime::DataTypeTraits::kIsUnsigned"], [1, 2, 1, "_CPPv4I_N8nvinfer18DataTypeE_bEN12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEEE", "tensorrt_llm::runtime::DataTypeTraits&lt;kDataType, kUnsigned, true&gt;"], [1, 8, 1, "_CPPv4I_N8nvinfer18DataTypeE_bEN12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEEE", "tensorrt_llm::runtime::DataTypeTraits&lt;kDataType, kUnsigned, true&gt;::kDataType"], [1, 8, 1, "_CPPv4I_N8nvinfer18DataTypeE_bEN12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEEE", "tensorrt_llm::runtime::DataTypeTraits&lt;kDataType, kUnsigned, true&gt;::kUnsigned"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;kDataType, kUnsigned, true&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;kDataType, kUnsigned, true&gt;::size"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;kDataType, kUnsigned, true&gt;::type"], [1, 2, 1, "_CPPv4I_bEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kBOOLE9kUnsignedEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kBOOL, kUnsigned&gt;"], [1, 8, 1, "_CPPv4I_bEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kBOOLE9kUnsignedEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kBOOL, kUnsigned&gt;::kUnsigned"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kBOOLE9kUnsignedE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kBOOL, kUnsigned&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kBOOLE9kUnsignedE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kBOOL, kUnsigned&gt;::size"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kBOOLE9kUnsignedE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kBOOL, kUnsigned&gt;::type"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kFLOATEEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kFLOAT&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kFLOATEE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kFLOAT&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kFLOATEE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kFLOAT&gt;::size"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kFLOATEE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kFLOAT&gt;::type"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kHALFEEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kHALF&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kHALFEE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kHALF&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kHALFEE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kHALF&gt;::size"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kHALFEE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kHALF&gt;::type"], [1, 2, 1, 
"_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EXL1EEEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT32, true&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EXL1EEE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT32, true&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EXL1EEE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT32, true&gt;::size"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EXL1EEE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT32, true&gt;::type"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT32&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT32&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT32&gt;::size"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT32&gt;::type"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64, true&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64, true&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64, true&gt;::size"], [1, 1, 1, 
"_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64, true&gt;::type"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64&gt;::size"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64&gt;::type"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kINT8EEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT8&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kINT8EE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT8&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kINT8EE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT8&gt;::size"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kINT8EE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT8&gt;::type"], [1, 2, 1, "_CPPv4I_bEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kUINT8E9kUnsignedEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kUINT8, kUnsigned&gt;"], [1, 8, 1, "_CPPv4I_bEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kUINT8E9kUnsignedEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kUINT8, kUnsigned&gt;::kUnsigned"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kUINT8E9kUnsignedE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kUINT8, kUnsigned&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kUINT8E9kUnsignedE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kUINT8, kUnsigned&gt;::size"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kUINT8E9kUnsignedE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kUINT8, kUnsigned&gt;::type"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInputE", "tensorrt_llm::runtime::DecodingInput"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13DecodingInputE10SizeType3210SizeType3210SizeType3210SizeType3214TensorConstPtr9TensorPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::DecodingInput"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13DecodingInputE10SizeType3210SizeType3210SizeType3210SizeType3214TensorConstPtr9TensorPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::DecodingInput::batchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13DecodingInputE10SizeType3210SizeType3210SizeType3210SizeType3214TensorConstPtr9TensorPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::DecodingInput::batchSlots"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13DecodingInputE10SizeType3210SizeType3210SizeType3210SizeType3214TensorConstPtr9TensorPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::DecodingInput::endIds"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13DecodingInputE10SizeType3210SizeType3210SizeType3210SizeType3214TensorConstPtr9TensorPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::DecodingInput::logits"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13DecodingInputE10SizeType3210SizeType3210SizeType3210SizeType3214TensorConstPtr9TensorPtr14TensorConstPtr", 
"tensorrt_llm::runtime::DecodingInput::DecodingInput::maxAttentionWindow"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13DecodingInputE10SizeType3210SizeType3210SizeType3210SizeType3214TensorConstPtr9TensorPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::DecodingInput::maxLength"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13DecodingInputE10SizeType3210SizeType3210SizeType3210SizeType3214TensorConstPtr9TensorPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::DecodingInput::sinkTokenLength"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputsE", "tensorrt_llm::runtime::DecodingInput::EagleInputs"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::acceptedLens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::acceptedPathIds"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", 
"tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::acceptedTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::chunkedContextNextTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::lastDraftLens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::lastDraftPaths"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::lastDraftTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::nextDraftLens"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::nextDraftPaths"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::nextDraftTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::seqSlots"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs12acceptedLensE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::acceptedLens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs15acceptedPathIdsE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::acceptedPathIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs14acceptedTokensE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::acceptedTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs24chunkedContextNextTokensE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::chunkedContextNextTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs13lastDraftLensE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::lastDraftLens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs14lastDraftPathsE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::lastDraftPaths"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs15lastDraftTokensE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::lastDraftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs13nextDraftLensE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::nextDraftLens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs14nextDraftPathsE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::nextDraftPaths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs15nextDraftTokensE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::nextDraftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs8seqSlotsE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::seqSlots"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputsE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs15bestPathIndicesE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::bestPathIndices"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs15bestPathLengthsE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::bestPathLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs16lastDraftIndicesE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::lastDraftIndices"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs15lastDraftTokensE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::lastDraftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs21lastGenerationLengthsE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::lastGenerationLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs19lastPositionIdsBaseE", 
"tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::lastPositionIdsBase"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs5masksE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::masks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs18maxGenLengthDeviceE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::maxGenLengthDevice"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs16nextDraftIndicesE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::nextDraftIndices"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs14nextDraftProbsE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::nextDraftProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs15nextDraftTokensE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::nextDraftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs14nextFlatTokensE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::nextFlatTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs21nextGenerationLengthsE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::nextGenerationLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs17packedPositionIdsE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::packedPositionIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs8seqSlotsE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::seqSlots"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputsE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs17constantThresholdE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::constantThreshold"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs11draftLogitsE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::draftLogits"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs10draftProbsE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::draftProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs13draftTokenIdsE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::draftTokenIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs14numDraftTokensE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::numDraftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs18numDraftTokensHostE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::numDraftTokensHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs4stepE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::step"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs11targetProbsE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::targetProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs14useDraftLogitsE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::useDraftLogits"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs18useDraftLogitsHostE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::useDraftLogitsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs28useRandomAcceptanceThresholdE", 
"tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::useRandomAcceptanceThreshold"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15LookaheadInputsE", "tensorrt_llm::runtime::DecodingInput::LookaheadInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15LookaheadInputs13tokensPerStepE", "tensorrt_llm::runtime::DecodingInput::LookaheadInputs::tokensPerStep"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputsE", "tensorrt_llm::runtime::DecodingInput::MedusaInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs22medusaCurTokensPerStepE", "tensorrt_llm::runtime::DecodingInput::MedusaInputs::medusaCurTokensPerStep"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs12medusaLogitsE", "tensorrt_llm::runtime::DecodingInput::MedusaInputs::medusaLogits"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs11medusaPathsE", "tensorrt_llm::runtime::DecodingInput::MedusaInputs::medusaPaths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs25medusaTargetTokensPerStepE", "tensorrt_llm::runtime::DecodingInput::MedusaInputs::medusaTargetTokensPerStep"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs13medusaTreeIdsE", "tensorrt_llm::runtime::DecodingInput::MedusaInputs::medusaTreeIds"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput14TensorConstPtrE", "tensorrt_llm::runtime::DecodingInput::TensorConstPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput9TensorPtrE", "tensorrt_llm::runtime::DecodingInput::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12badWordsLensE", "tensorrt_llm::runtime::DecodingInput::badWordsLens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13badWordsListsE", "tensorrt_llm::runtime::DecodingInput::badWordsLists"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12badWordsPtrsE", "tensorrt_llm::runtime::DecodingInput::badWordsPtrs"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime13DecodingInput9batchSizeE", "tensorrt_llm::runtime::DecodingInput::batchSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput10batchSlotsE", "tensorrt_llm::runtime::DecodingInput::batchSlots"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput10beamWidthsE", "tensorrt_llm::runtime::DecodingInput::beamWidths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput16cacheIndirectionE", "tensorrt_llm::runtime::DecodingInput::cacheIndirection"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11eagleInputsE", "tensorrt_llm::runtime::DecodingInput::eagleInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13embeddingBiasE", "tensorrt_llm::runtime::DecodingInput::embeddingBias"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput6endIdsE", "tensorrt_llm::runtime::DecodingInput::endIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25explicitDraftTokensInputsE", "tensorrt_llm::runtime::DecodingInput::explicitDraftTokensInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25externalDraftTokensInputsE", "tensorrt_llm::runtime::DecodingInput::externalDraftTokensInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13finishReasonsE", "tensorrt_llm::runtime::DecodingInput::finishReasons"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15generationStepsE", "tensorrt_llm::runtime::DecodingInput::generationSteps"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput7lengthsE", "tensorrt_llm::runtime::DecodingInput::lengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput6logitsE", "tensorrt_llm::runtime::DecodingInput::logits"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput9logitsVecE", "tensorrt_llm::runtime::DecodingInput::logitsVec"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15lookaheadInputsE", "tensorrt_llm::runtime::DecodingInput::lookaheadInputs"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime13DecodingInput18maxAttentionWindowE", "tensorrt_llm::runtime::DecodingInput::maxAttentionWindow"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput14maxBadWordsLenE", "tensorrt_llm::runtime::DecodingInput::maxBadWordsLen"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput9maxLengthE", "tensorrt_llm::runtime::DecodingInput::maxLength"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15maxStopWordsLenE", "tensorrt_llm::runtime::DecodingInput::maxStopWordsLen"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12medusaInputsE", "tensorrt_llm::runtime::DecodingInput::medusaInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput17noRepeatNgramSizeE", "tensorrt_llm::runtime::DecodingInput::noRepeatNgramSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput19sequenceLimitLengthE", "tensorrt_llm::runtime::DecodingInput::sequenceLimitLength"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15sinkTokenLengthE", "tensorrt_llm::runtime::DecodingInput::sinkTokenLength"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput4stepE", "tensorrt_llm::runtime::DecodingInput::step"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13stopWordsLensE", "tensorrt_llm::runtime::DecodingInput::stopWordsLens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput14stopWordsListsE", "tensorrt_llm::runtime::DecodingInput::stopWordsLists"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13stopWordsPtrsE", "tensorrt_llm::runtime::DecodingInput::stopWordsPtrs"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutputE", "tensorrt_llm::runtime::DecodingOutput"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypothesesE", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses10batchDonesE", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::batchDones"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses14cumLogProbsCBAE", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::cumLogProbsCBA"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses5emptyERK13BufferManager", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::empty"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses5emptyERK13BufferManager", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::empty::manager"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses4initERK13BufferManager11TokenIdType", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::init"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses4initERK13BufferManager11TokenIdType", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::init::endId"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses4initERK13BufferManager11TokenIdType", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::init::manager"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses11logProbsCBAE", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::logProbsCBA"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses18minNormedScoresCBAE", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::minNormedScoresCBA"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses15normedScoresCBAE", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::normedScoresCBA"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses11numBeamsCBAE", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::numBeamsCBA"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses12outputIdsCBAE", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::outputIdsCBA"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses7releaseEv", 
"tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::release"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses7reshapeE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::reshape"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses7reshapeE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::reshape::batchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses7reshapeE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::reshape::beamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses7reshapeE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::reshape::maxSequenceLength"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses18sequenceLengthsCBAE", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::sequenceLengthsCBA"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses5sliceE10SizeType3210SizeType32", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::slice"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses5sliceE10SizeType3210SizeType32", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::slice::batchIndex"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses5sliceE10SizeType3210SizeType32", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::slice::size"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14DecodingOutputE9TensorPtr9TensorPtr", "tensorrt_llm::runtime::DecodingOutput::DecodingOutput"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14DecodingOutputE9TensorPtr9TensorPtr", "tensorrt_llm::runtime::DecodingOutput::DecodingOutput::gatheredIds"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14DecodingOutputE9TensorPtr9TensorPtr", 
"tensorrt_llm::runtime::DecodingOutput::DecodingOutput::ids"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputsE", "tensorrt_llm::runtime::DecodingOutput::SpeculativeDecodingOutputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs21acceptedLengthsCumSumE", "tensorrt_llm::runtime::DecodingOutput::SpeculativeDecodingOutputs::acceptedLengthsCumSum"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs17acceptedTokensLenE", "tensorrt_llm::runtime::DecodingOutput::SpeculativeDecodingOutputs::acceptedTokensLen"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs15nextDraftTokensE", "tensorrt_llm::runtime::DecodingOutput::SpeculativeDecodingOutputs::nextDraftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs18nextDraftTokensLenE", "tensorrt_llm::runtime::DecodingOutput::SpeculativeDecodingOutputs::nextDraftTokensLen"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs12pathsOffsetsE", "tensorrt_llm::runtime::DecodingOutput::SpeculativeDecodingOutputs::pathsOffsets"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs18prevDraftTokensLenE", "tensorrt_llm::runtime::DecodingOutput::SpeculativeDecodingOutputs::prevDraftTokensLen"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput9TensorPtrE", "tensorrt_llm::runtime::DecodingOutput::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14beamHypothesesE", "tensorrt_llm::runtime::DecodingOutput::beamHypotheses"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput16cacheIndirectionE", "tensorrt_llm::runtime::DecodingOutput::cacheIndirection"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput11cumLogProbsE", "tensorrt_llm::runtime::DecodingOutput::cumLogProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput12eagleBuffersE", 
"tensorrt_llm::runtime::DecodingOutput::eagleBuffers"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26explicitDraftTokensBuffersE", "tensorrt_llm::runtime::DecodingOutput::explicitDraftTokensBuffers"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput13finishReasonsE", "tensorrt_llm::runtime::DecodingOutput::finishReasons"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput11finishedSumE", "tensorrt_llm::runtime::DecodingOutput::finishedSum"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput11gatheredIdsE", "tensorrt_llm::runtime::DecodingOutput::gatheredIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput3idsE", "tensorrt_llm::runtime::DecodingOutput::ids"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput17kNegativeInfinityE", "tensorrt_llm::runtime::DecodingOutput::kNegativeInfinity"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput7lengthsE", "tensorrt_llm::runtime::DecodingOutput::lengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput8logProbsE", "tensorrt_llm::runtime::DecodingOutput::logProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput13logProbsTiledE", "tensorrt_llm::runtime::DecodingOutput::logProbsTiled"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput16lookaheadOutputsE", "tensorrt_llm::runtime::DecodingOutput::lookaheadOutputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput9newTokensE", "tensorrt_llm::runtime::DecodingOutput::newTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14newTokensStepsE", "tensorrt_llm::runtime::DecodingOutput::newTokensSteps"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput12newTokensVecE", "tensorrt_llm::runtime::DecodingOutput::newTokensVec"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput9parentIdsE", "tensorrt_llm::runtime::DecodingOutput::parentIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26speculativeDecodingOutputsE", 
"tensorrt_llm::runtime::DecodingOutput::speculativeDecodingOutputs"], [1, 2, 1, "_CPPv4I0EN12tensorrt_llm7runtime20DeviceAllocationNvlsE", "tensorrt_llm::runtime::DeviceAllocationNvls"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls20DeviceAllocationNvlsEv", "tensorrt_llm::runtime::DeviceAllocationNvls::DeviceAllocationNvls"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime20DeviceAllocationNvlsE", "tensorrt_llm::runtime::DeviceAllocationNvls::T"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls9_capacityE", "tensorrt_llm::runtime::DeviceAllocationNvls::_capacity"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls7_handleE", "tensorrt_llm::runtime::DeviceAllocationNvls::_handle"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls4freeEv", "tensorrt_llm::runtime::DeviceAllocationNvls::free"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime20DeviceAllocationNvls11getCapacityEv", "tensorrt_llm::runtime::DeviceAllocationNvls::getCapacity"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls21getIpcUnicastPointersEv", "tensorrt_llm::runtime::DeviceAllocationNvls::getIpcUnicastPointers"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime20DeviceAllocationNvls19getMulticastPointerEv", "tensorrt_llm::runtime::DeviceAllocationNvls::getMulticastPointer"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime20DeviceAllocationNvls17getUnicastPointerEv", "tensorrt_llm::runtime::DeviceAllocationNvls::getUnicastPointer"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls5resetE6size_tNSt3setIiEE", "tensorrt_llm::runtime::DeviceAllocationNvls::reset"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls5resetE6size_tNSt3setIiEE", "tensorrt_llm::runtime::DeviceAllocationNvls::reset::ranks"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls5resetE6size_tNSt3setIiEE", "tensorrt_llm::runtime::DeviceAllocationNvls::reset::size"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvlsD0Ev", 
"tensorrt_llm::runtime::DeviceAllocationNvls::~DeviceAllocationNvls"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffersE", "tensorrt_llm::runtime::EagleBuffers"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers9BufferPtrE", "tensorrt_llm::runtime::EagleBuffers::BufferPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12EagleBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN8executor14DecodingConfigE", "tensorrt_llm::runtime::EagleBuffers::EagleBuffers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12EagleBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN8executor14DecodingConfigE", "tensorrt_llm::runtime::EagleBuffers::EagleBuffers::decodingConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12EagleBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN8executor14DecodingConfigE", "tensorrt_llm::runtime::EagleBuffers::EagleBuffers::manager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12EagleBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN8executor14DecodingConfigE", "tensorrt_llm::runtime::EagleBuffers::EagleBuffers::maxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12EagleBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN8executor14DecodingConfigE", "tensorrt_llm::runtime::EagleBuffers::EagleBuffers::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12EagleBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN8executor14DecodingConfigE", "tensorrt_llm::runtime::EagleBuffers::EagleBuffers::modelConfig"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime12EagleBuffers12EagleBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN8executor14DecodingConfigE", "tensorrt_llm::runtime::EagleBuffers::EagleBuffers::worldConfig"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputsE", "tensorrt_llm::runtime::EagleBuffers::EngineOutputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs12acceptedLensE", "tensorrt_llm::runtime::EagleBuffers::EngineOutputs::acceptedLens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs13acceptedPathsE", "tensorrt_llm::runtime::EagleBuffers::EngineOutputs::acceptedPaths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs14acceptedTokensE", "tensorrt_llm::runtime::EagleBuffers::EngineOutputs::acceptedTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs24chunkedContextNextTokensE", "tensorrt_llm::runtime::EagleBuffers::EngineOutputs::chunkedContextNextTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs13nextDraftLensE", "tensorrt_llm::runtime::EagleBuffers::EngineOutputs::nextDraftLens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs14nextDraftPathsE", "tensorrt_llm::runtime::EagleBuffers::EngineOutputs::nextDraftPaths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs15nextDraftTokensE", "tensorrt_llm::runtime::EagleBuffers::EngineOutputs::nextDraftTokens"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers7ITensorE", "tensorrt_llm::runtime::EagleBuffers::ITensor"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6InputsE", "tensorrt_llm::runtime::EagleBuffers::Inputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs22allLayersDraftTokenIdsE", "tensorrt_llm::runtime::EagleBuffers::Inputs::allLayersDraftTokenIds"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs33allLayersDraftTokenIdsPredecessorE", "tensorrt_llm::runtime::EagleBuffers::Inputs::allLayersDraftTokenIdsPredecessor"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs15allLayersScoresE", "tensorrt_llm::runtime::EagleBuffers::Inputs::allLayersScores"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs24chunkedContextNextTokensE", "tensorrt_llm::runtime::EagleBuffers::Inputs::chunkedContextNextTokens"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs6createE10SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::EagleBuffers::Inputs::create"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs6createE10SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::EagleBuffers::Inputs::create::manager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs6createE10SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::EagleBuffers::Inputs::create::maxNumSequences"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs6createE10SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::EagleBuffers::Inputs::create::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs6createE10SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::EagleBuffers::Inputs::create::worldConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs20currentExpandIndicesE", "tensorrt_llm::runtime::EagleBuffers::Inputs::currentExpandIndices"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs9draftLensE", "tensorrt_llm::runtime::EagleBuffers::Inputs::draftLens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs10draftPathsE", "tensorrt_llm::runtime::EagleBuffers::Inputs::draftPaths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs14draftPathsHostE", 
"tensorrt_llm::runtime::EagleBuffers::Inputs::draftPathsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs11draftTokensE", "tensorrt_llm::runtime::EagleBuffers::Inputs::draftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs22dynamicTreeMaxTopKHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::dynamicTreeMaxTopKHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs29eagleNetCtxContextLengthsHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::eagleNetCtxContextLengthsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs34eagleNetCtxPastKeyValueLengthsHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::eagleNetCtxPastKeyValueLengthsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs27eagleNetCtxRequestTypesHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::eagleNetCtxRequestTypesHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs29eagleNetGenContextLengthsHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::eagleNetGenContextLengthsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs34eagleNetGenPastKeyValueLengthsHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::eagleNetGenPastKeyValueLengthsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs27eagleNetGenRequestTypesHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::eagleNetGenRequestTypesHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs18inputGenTokensHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::inputGenTokensHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs14posteriorAlphaE", "tensorrt_llm::runtime::EagleBuffers::Inputs::posteriorAlpha"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs18posteriorThresholdE", "tensorrt_llm::runtime::EagleBuffers::Inputs::posteriorThreshold"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs10prevScoresE", 
"tensorrt_llm::runtime::EagleBuffers::Inputs::prevScores"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs16randomDataSampleE", "tensorrt_llm::runtime::EagleBuffers::Inputs::randomDataSample"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs20randomDataValidationE", "tensorrt_llm::runtime::EagleBuffers::Inputs::randomDataValidation"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs29specDecodingGenerationLengthsE", "tensorrt_llm::runtime::EagleBuffers::Inputs::specDecodingGenerationLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs33specDecodingGenerationLengthsHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::specDecodingGenerationLengthsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs23specDecodingPackedMasksE", "tensorrt_llm::runtime::EagleBuffers::Inputs::specDecodingPackedMasks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs27specDecodingPositionOffsetsE", "tensorrt_llm::runtime::EagleBuffers::Inputs::specDecodingPositionOffsets"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs12temperaturesE", "tensorrt_llm::runtime::EagleBuffers::Inputs::temperatures"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs18useDynamicTreeHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::useDynamicTreeHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs15useSpecDecodingE", "tensorrt_llm::runtime::EagleBuffers::Inputs::useSpecDecoding"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13LlmRequestPtrE", "tensorrt_llm::runtime::EagleBuffers::LlmRequestPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13RequestVectorE", "tensorrt_llm::runtime::EagleBuffers::RequestVector"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers10SizeType32E", "tensorrt_llm::runtime::EagleBuffers::SizeType32"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers9TensorMapE", "tensorrt_llm::runtime::EagleBuffers::TensorMap"], [1, 
1, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers9TensorPtrE", "tensorrt_llm::runtime::EagleBuffers::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers28chunkedContextNextTokensHostE", "tensorrt_llm::runtime::EagleBuffers::chunkedContextNextTokensHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers23cumSumGenerationLengthsE", "tensorrt_llm::runtime::EagleBuffers::cumSumGenerationLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12engineInputsE", "tensorrt_llm::runtime::EagleBuffers::engineInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13engineOutputsE", "tensorrt_llm::runtime::EagleBuffers::engineOutputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers18greedySamplingHostE", "tensorrt_llm::runtime::EagleBuffers::greedySamplingHost"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::insertInputTensors"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::insertInputTensors::inputBuffers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::insertInputTensors::outputBuffers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::insertInputTensors::worldConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers26mDefaultPosteriorThresholdE", "tensorrt_llm::runtime::EagleBuffers::mDefaultPosteriorThreshold"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers17mDoGreedySamplingE", "tensorrt_llm::runtime::EagleBuffers::mDoGreedySampling"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers19maxGenerationLengthE", 
"tensorrt_llm::runtime::EagleBuffers::maxGenerationLength"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers18posteriorAlphaHostE", "tensorrt_llm::runtime::EagleBuffers::posteriorAlphaHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers22posteriorThresholdHostE", "tensorrt_llm::runtime::EagleBuffers::posteriorThresholdHost"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", "tensorrt_llm::runtime::EagleBuffers::reshape"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", "tensorrt_llm::runtime::EagleBuffers::reshape::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", "tensorrt_llm::runtime::EagleBuffers::reshape::numCtxSequences"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", "tensorrt_llm::runtime::EagleBuffers::reshape::numGenSequences"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers21scanReduceTempStorageE", "tensorrt_llm::runtime::EagleBuffers::scanReduceTempStorage"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers26scanReduceTempStorageBytesE", "tensorrt_llm::runtime::EagleBuffers::scanReduceTempStorageBytes"], [1, 3, 1, "_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs"], [1, 8, 1, 
"_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::T"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::contextRequests"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::contextRequests"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::decoderBuffers"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::draftBuffers"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::eagleModule"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", 
"tensorrt_llm::runtime::EagleBuffers::setFromInputs::genRequests"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::genRequests"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::manager"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::manager"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::modelConfig"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::requestTypes"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::seqSlots"], [1, 4, 1, 
"_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::seqSlots"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::vocabSizePadded"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::worldConfig"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModuleE", "tensorrt_llm::runtime::EagleModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModule11EagleModuleE10SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::EagleModule::EagleModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModule11EagleModuleEv", "tensorrt_llm::runtime::EagleModule::EagleModule"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModule11EagleModuleE10SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::EagleModule::EagleModule::maxDecodingDraftTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModule11EagleModuleE10SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::EagleModule::EagleModule::maxDraftPathLen"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModule11EagleModuleE10SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::EagleModule::EagleModule::maxNonLeafNodesPerLayer"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModule11EagleModuleE10SizeType3210SizeType3210SizeType3210SizeType32", 
"tensorrt_llm::runtime::EagleModule::EagleModule::numTransformersLayer"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11EagleModule22getDefaultEagleChoicesEv", "tensorrt_llm::runtime::EagleModule::getDefaultEagleChoices"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11EagleModule26getMaxNonLeafNodesPerLayerEv", "tensorrt_llm::runtime::EagleModule::getMaxNonLeafNodesPerLayer"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11EagleModule23getNumTransformerLayersEv", "tensorrt_llm::runtime::EagleModule::getNumTransformerLayers"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModule20mDefaultEagleChoicesE", "tensorrt_llm::runtime::EagleModule::mDefaultEagleChoices"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModule24mMaxNonLeafNodesPerLayerE", "tensorrt_llm::runtime::EagleModule::mMaxNonLeafNodesPerLayer"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModule21mNumTransformersLayerE", "tensorrt_llm::runtime::EagleModule::mNumTransformersLayer"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffersE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers9BufferPtrE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::BufferPtr"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers12EngineInputsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers12EngineInputs15positionOffsetsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs::positionOffsets"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers12EngineInputs18requestTypesDeviceE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs::requestTypesDevice"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs15bestPathIndicesE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::bestPathIndices"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs15bestPathLengthsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::bestPathLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs5masksE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::masks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs11maxGenTokenE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::maxGenToken"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs16nextDraftIndicesE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::nextDraftIndices"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs14nextDraftProbsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::nextDraftProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs15nextDraftTokensE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::nextDraftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs14nextFlatTokensE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::nextFlatTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs21nextGenerationLengthsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::nextGenerationLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs19nextPositionOffsetsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::nextPositionOffsets"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs17packedPositionIdsE", 
"tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::packedPositionIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs13totalGenTokenE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::totalGenToken"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers26ExplicitDraftTokensBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::ExplicitDraftTokensBuffers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers26ExplicitDraftTokensBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::ExplicitDraftTokensBuffers::manager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers26ExplicitDraftTokensBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::ExplicitDraftTokensBuffers::maxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers26ExplicitDraftTokensBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::ExplicitDraftTokensBuffers::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers26ExplicitDraftTokensBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::ExplicitDraftTokensBuffers::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers26ExplicitDraftTokensBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", 
"tensorrt_llm::runtime::ExplicitDraftTokensBuffers::ExplicitDraftTokensBuffers::worldConfig"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers7ITensorE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::ITensor"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6InputsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs6createE10SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::create"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs6createE10SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::create::manager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs6createE10SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::create::maxNumSequences"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs6createE10SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::create::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs6createE10SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::create::worldConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs12draftIndicesE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::draftIndices"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs10draftProbsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::draftProbs"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs11draftTokensE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::draftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs17generationLengthsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::generationLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs21generationLengthsHostE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::generationLengthsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs16maxGenLengthHostE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::maxGenLengthHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs11packedMasksE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::packedMasks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs11positionIdsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::positionIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs15positionIdsBaseE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::positionIdsBase"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs16randomDataSampleE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::randomDataSample"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs20randomDataValidationE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::randomDataValidation"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs12temperaturesE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::temperatures"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs15useSpecDecodingE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::useSpecDecoding"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers10SizeType32E", 
"tensorrt_llm::runtime::ExplicitDraftTokensBuffers::SizeType32"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers9TensorMapE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::TensorMap"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers9TensorPtrE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers23cumSumGenerationLengthsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::cumSumGenerationLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers12engineInputsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::engineInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13engineOutputsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::engineOutputs"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::insertInputTensors"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::insertInputTensors::inputBuffers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::insertInputTensors::outputBuffers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::insertInputTensors::worldConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::reshape"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::reshape::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::reshape::numCtxSequences"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::reshape::numGenSequences"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers15scanTempStorageE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::scanTempStorage"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers20scanTempStorageBytesE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::scanTempStorageBytes"], [1, 3, 1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs"], [1, 8, 1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::T"], [1, 4, 1, 
"_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::contextPositionIds"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::contextPositionIds"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::decoderBuffers"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::draftBuffers"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::explicitDraftTokensModule"], [1, 4, 1, 
"_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::manager"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::modelConfig"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::numCtxSequences"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::numCtxSequences"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::numGenSequences"], [1, 4, 1, 
"_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::numGenSequences"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::requestTypes"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::seqSlots"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::seqSlots"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::stream"], [1, 4, 1, 
"_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::stream"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::vocabSizePadded"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::worldConfig"], [1, 2, 1, "_CPPv4I0EN12tensorrt_llm7runtime25GenericPromptTuningParamsE", "tensorrt_llm::runtime::GenericPromptTuningParams"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams25GenericPromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", "tensorrt_llm::runtime::GenericPromptTuningParams::GenericPromptTuningParams"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams25GenericPromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", "tensorrt_llm::runtime::GenericPromptTuningParams::GenericPromptTuningParams::embeddingTable"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams25GenericPromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", "tensorrt_llm::runtime::GenericPromptTuningParams::GenericPromptTuningParams::tasks"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams25GenericPromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", 
"tensorrt_llm::runtime::GenericPromptTuningParams::GenericPromptTuningParams::vocabSize"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams10SizeType32E", "tensorrt_llm::runtime::GenericPromptTuningParams::SizeType32"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime25GenericPromptTuningParamsE", "tensorrt_llm::runtime::GenericPromptTuningParams::TTensor"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams9TensorPtrE", "tensorrt_llm::runtime::GenericPromptTuningParams::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams14embeddingTableE", "tensorrt_llm::runtime::GenericPromptTuningParams::embeddingTable"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams19promptTuningEnabledE", "tensorrt_llm::runtime::GenericPromptTuningParams::promptTuningEnabled"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams5tasksE", "tensorrt_llm::runtime::GenericPromptTuningParams::tasks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams9vocabSizeE", "tensorrt_llm::runtime::GenericPromptTuningParams::vocabSize"], [1, 2, 1, "_CPPv4I0EN12tensorrt_llm7runtime10GptDecoderE", "tensorrt_llm::runtime::GptDecoder"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder13CudaStreamPtrE", "tensorrt_llm::runtime::GptDecoder::CudaStreamPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::GptDecoder::GptDecoder"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::GptDecoder::GptDecoder::maxBatchSize"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::GptDecoder::GptDecoder::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::GptDecoder::GptDecoder::maxSequenceLength"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::GptDecoder::GptDecoder::mode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::GptDecoder::GptDecoder::speculativeDecodingModule"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::GptDecoder::GptDecoder::stream"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::GptDecoder::GptDecoder::vocabSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::GptDecoder::GptDecoder::vocabSizePadded"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime10GptDecoderE", "tensorrt_llm::runtime::GptDecoder::T"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder9TensorPtrE", 
"tensorrt_llm::runtime::GptDecoder::TensorPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", "tensorrt_llm::runtime::GptDecoder::disableLookahead"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", "tensorrt_llm::runtime::GptDecoder::disableLookahead::batchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", "tensorrt_llm::runtime::GptDecoder::disableLookahead::batchSlots"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", "tensorrt_llm::runtime::GptDecoder::disableLookahead::samplingConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder12forwardAsyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::GptDecoder::forwardAsync"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder12forwardAsyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::GptDecoder::forwardAsync::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder12forwardAsyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::GptDecoder::forwardAsync::output"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder11forwardSyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::GptDecoder::forwardSync"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder11forwardSyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::GptDecoder::forwardSync::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder11forwardSyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::GptDecoder::forwardSync::output"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder17getSamplingConfigEv", "tensorrt_llm::runtime::GptDecoder::getSamplingConfig"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime10GptDecoder23mDecodingLayerWorkspaceE", "tensorrt_llm::runtime::GptDecoder::mDecodingLayerWorkspace"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder13mDecodingModeE", "tensorrt_llm::runtime::GptDecoder::mDecodingMode"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder19mDynamicDecodeLayerE", "tensorrt_llm::runtime::GptDecoder::mDynamicDecodeLayer"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder8mManagerE", "tensorrt_llm::runtime::GptDecoder::mManager"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder13mMaxBatchSizeE", "tensorrt_llm::runtime::GptDecoder::mMaxBatchSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder15mSamplingConfigE", "tensorrt_llm::runtime::GptDecoder::mSamplingConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10mVocabSizeE", "tensorrt_llm::runtime::GptDecoder::mVocabSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder16mVocabSizePaddedE", "tensorrt_llm::runtime::GptDecoder::mVocabSizePadded"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::GptDecoder::setup"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::GptDecoder::setup::batchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::GptDecoder::setup::batchSlots"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::GptDecoder::setup::output"], [1, 
4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::GptDecoder::setup::requests"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::GptDecoder::setup::samplingConfig"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatchedE", "tensorrt_llm::runtime::GptDecoderBatched"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13CudaStreamPtrE", "tensorrt_llm::runtime::GptDecoderBatched::CudaStreamPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtrRK23SpeculativeDecodingModeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::GptDecoderBatched::GptDecoderBatched"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtrRK23SpeculativeDecodingModeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::GptDecoderBatched::GptDecoderBatched::dtype"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtrRK23SpeculativeDecodingModeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::GptDecoderBatched::GptDecoderBatched::speculativeDecodingMode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtrRK23SpeculativeDecodingModeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::GptDecoderBatched::GptDecoderBatched::stream"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13GptDecoderPtrE", "tensorrt_llm::runtime::GptDecoderBatched::GptDecoderPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13LlmRequestPtrE", "tensorrt_llm::runtime::GptDecoderBatched::LlmRequestPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13RequestVectorE", 
"tensorrt_llm::runtime::GptDecoderBatched::RequestVector"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14SharedConstPtrE", "tensorrt_llm::runtime::GptDecoderBatched::SharedConstPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched9TensorPtrE", "tensorrt_llm::runtime::GptDecoderBatched::TensorPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched16disableLookaheadERK13RequestVectorRK9TensorPtr", "tensorrt_llm::runtime::GptDecoderBatched::disableLookahead"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched16disableLookaheadERK13RequestVectorRK9TensorPtr", "tensorrt_llm::runtime::GptDecoderBatched::disableLookahead::batchSlots"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched16disableLookaheadERK13RequestVectorRK9TensorPtr", "tensorrt_llm::runtime::GptDecoderBatched::disableLookahead::genRequests"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::GptDecoderBatched::finalize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::GptDecoderBatched::finalize::batchSlot"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::GptDecoderBatched::finalize::decoderState"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::GptDecoderBatched::finalize::samplingConfig"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::GptDecoderBatched::finalize::streaming"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE", 
"tensorrt_llm::runtime::GptDecoderBatched::forward"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::forward::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::forward::output"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::forwardAsync"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::forwardAsync::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::forwardAsync::output"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15forwardDispatchERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::forwardDispatch"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15forwardDispatchERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::forwardDispatch::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15forwardDispatchERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::forwardDispatch::output"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched16getBufferManagerEv", "tensorrt_llm::runtime::GptDecoderBatched::getBufferManager"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15getDecoderStateEv", "tensorrt_llm::runtime::GptDecoderBatched::getDecoderState"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched15getDecoderStateEv", 
"tensorrt_llm::runtime::GptDecoderBatched::getDecoderState"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched16getDecoderStreamEv", "tensorrt_llm::runtime::GptDecoderBatched::getDecoderStream"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched20getUnderlyingDecoderEv", "tensorrt_llm::runtime::GptDecoderBatched::getUnderlyingDecoder"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14mBufferManagerE", "tensorrt_llm::runtime::GptDecoderBatched::mBufferManager"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched8mDecoderE", "tensorrt_llm::runtime::GptDecoderBatched::mDecoder"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13mDecoderStateE", "tensorrt_llm::runtime::GptDecoderBatched::mDecoderState"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14mDecoderStreamE", "tensorrt_llm::runtime::GptDecoderBatched::mDecoderStream"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14mRuntimeStreamE", "tensorrt_llm::runtime::GptDecoderBatched::mRuntimeStream"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14prepareForwardE10SizeType32RN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::prepareForward"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14prepareForwardE10SizeType32RN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::prepareForward::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14prepareForwardE10SizeType32RN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::prepareForward::output"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14prepareForwardE10SizeType32RN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::prepareForward::step"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14setEagleInputsERKN13decoder_batch5InputE", 
"tensorrt_llm::runtime::GptDecoderBatched::setEagleInputs"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14setEagleInputsERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::setEagleInputs::input"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched28setExplicitDraftTokensInputsERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::setExplicitDraftTokensInputs"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched28setExplicitDraftTokensInputsERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::setExplicitDraftTokensInputs::input"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::dtype"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::maxAttentionWindow"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::maxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", 
"tensorrt_llm::runtime::GptDecoderBatched::setup::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::maxSequenceLength"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::maxTokensPerStep"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::mode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::sinkTokenLength"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::worldConfig"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfigE", "tensorrt_llm::runtime::GptJsonConfig"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig::contextParallelism"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig::gpusPerNode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig::name"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig::pipelineParallelism"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig::precision"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig::runtimeDefaults"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig::tensorParallelism"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig::version"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14engineFilenameERK11WorldConfig", "tensorrt_llm::runtime::GptJsonConfig::engineFilename"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14engineFilenameERK11WorldConfigRKNSt6stringE", "tensorrt_llm::runtime::GptJsonConfig::engineFilename"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14engineFilenameERK11WorldConfigRKNSt6stringE", "tensorrt_llm::runtime::GptJsonConfig::engineFilename::model"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14engineFilenameERK11WorldConfig", "tensorrt_llm::runtime::GptJsonConfig::engineFilename::worldConfig"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14engineFilenameERK11WorldConfigRKNSt6stringE", "tensorrt_llm::runtime::GptJsonConfig::engineFilename::worldConfig"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig21getContextParallelismEv", "tensorrt_llm::runtime::GptJsonConfig::getContextParallelism"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14getGpusPerNodeEv", "tensorrt_llm::runtime::GptJsonConfig::getGpusPerNode"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14getModelConfigEv", 
"tensorrt_llm::runtime::GptJsonConfig::getModelConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig21getModelConfigMutableEv", "tensorrt_llm::runtime::GptJsonConfig::getModelConfigMutable"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig7getNameEv", "tensorrt_llm::runtime::GptJsonConfig::getName"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig22getPipelineParallelismEv", "tensorrt_llm::runtime::GptJsonConfig::getPipelineParallelism"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig12getPrecisionEv", "tensorrt_llm::runtime::GptJsonConfig::getPrecision"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig18getRuntimeDefaultsEv", "tensorrt_llm::runtime::GptJsonConfig::getRuntimeDefaults"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig20getTensorParallelismEv", "tensorrt_llm::runtime::GptJsonConfig::getTensorParallelism"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig10getVersionEv", "tensorrt_llm::runtime::GptJsonConfig::getVersion"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig12getWorldSizeEv", "tensorrt_llm::runtime::GptJsonConfig::getWorldSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig19mContextParallelismE", "tensorrt_llm::runtime::GptJsonConfig::mContextParallelism"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig12mGpusPerNodeE", "tensorrt_llm::runtime::GptJsonConfig::mGpusPerNode"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig12mModelConfigE", "tensorrt_llm::runtime::GptJsonConfig::mModelConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5mNameE", "tensorrt_llm::runtime::GptJsonConfig::mName"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig20mPipelineParallelismE", "tensorrt_llm::runtime::GptJsonConfig::mPipelineParallelism"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig10mPrecisionE", "tensorrt_llm::runtime::GptJsonConfig::mPrecision"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime13GptJsonConfig16mRuntimeDefaultsE", "tensorrt_llm::runtime::GptJsonConfig::mRuntimeDefaults"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig18mTensorParallelismE", "tensorrt_llm::runtime::GptJsonConfig::mTensorParallelism"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig8mVersionE", "tensorrt_llm::runtime::GptJsonConfig::mVersion"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5parseERKNSt10filesystem4pathE", "tensorrt_llm::runtime::GptJsonConfig::parse"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5parseERKNSt6stringE", "tensorrt_llm::runtime::GptJsonConfig::parse"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5parseERNSt7istreamE", "tensorrt_llm::runtime::GptJsonConfig::parse"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5parseERKNSt6stringE", "tensorrt_llm::runtime::GptJsonConfig::parse::json"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5parseERNSt7istreamE", "tensorrt_llm::runtime::GptJsonConfig::parse::json"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5parseERKNSt10filesystem4pathE", "tensorrt_llm::runtime::GptJsonConfig::parse::path"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime7IBufferE", "tensorrt_llm::runtime::IBuffer"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer8DataTypeE", "tensorrt_llm::runtime::IBuffer::DataType"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer7IBufferERK7IBuffer", "tensorrt_llm::runtime::IBuffer::IBuffer"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer7IBufferEv", "tensorrt_llm::runtime::IBuffer::IBuffer"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer14SharedConstPtrE", "tensorrt_llm::runtime::IBuffer::SharedConstPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer9SharedPtrE", "tensorrt_llm::runtime::IBuffer::SharedPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer14UniqueConstPtrE", "tensorrt_llm::runtime::IBuffer::UniqueConstPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer9UniquePtrE", 
"tensorrt_llm::runtime::IBuffer::UniquePtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4dataENSt6size_tE", "tensorrt_llm::runtime::IBuffer::data"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4dataEv", "tensorrt_llm::runtime::IBuffer::data"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer4dataENSt6size_tE", "tensorrt_llm::runtime::IBuffer::data"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer4dataEv", "tensorrt_llm::runtime::IBuffer::data"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4dataENSt6size_tE", "tensorrt_llm::runtime::IBuffer::data::index"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer4dataENSt6size_tE", "tensorrt_llm::runtime::IBuffer::data::index"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer11getCapacityEv", "tensorrt_llm::runtime::IBuffer::getCapacity"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer11getDataTypeEv", "tensorrt_llm::runtime::IBuffer::getDataType"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer15getDataTypeNameE8DataType", "tensorrt_llm::runtime::IBuffer::getDataTypeName"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer15getDataTypeNameEv", "tensorrt_llm::runtime::IBuffer::getDataTypeName"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer15getDataTypeNameE8DataType", "tensorrt_llm::runtime::IBuffer::getDataTypeName::dataType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer13getMemoryTypeEv", "tensorrt_llm::runtime::IBuffer::getMemoryType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer17getMemoryTypeNameEv", "tensorrt_llm::runtime::IBuffer::getMemoryTypeName"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer7getSizeEv", "tensorrt_llm::runtime::IBuffer::getSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer14getSizeInBytesEv", "tensorrt_llm::runtime::IBuffer::getSizeInBytes"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer10memoryTypeEPKv", "tensorrt_llm::runtime::IBuffer::memoryType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer10memoryTypeEPKv", 
"tensorrt_llm::runtime::IBuffer::memoryType::data"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBufferaSERK7IBuffer", "tensorrt_llm::runtime::IBuffer::operator="], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer7releaseEv", "tensorrt_llm::runtime::IBuffer::release"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer6resizeENSt6size_tE", "tensorrt_llm::runtime::IBuffer::resize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer6resizeENSt6size_tE", "tensorrt_llm::runtime::IBuffer::resize::newSize"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer5sliceE9SharedPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer5sliceE9SharedPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::TConstPtr"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::TConstPtr"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer5sliceE9SharedPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::buffer"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer5sliceE9SharedPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::buffer"], [1, 4, 1, 
"_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::offset"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::offset"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer5sliceE9SharedPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::offset"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer5sliceE9SharedPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::offset"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer5sliceE9SharedPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::size"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::tensor"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::tensor"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer7toBytesENSt6size_tE", "tensorrt_llm::runtime::IBuffer::toBytes"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer7toBytesENSt6size_tE", "tensorrt_llm::runtime::IBuffer::toBytes::size"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer4viewE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::view"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime7IBuffer4viewE9SharedPtr", "tensorrt_llm::runtime::IBuffer::view"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4viewE9SharedPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::view"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer4viewE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::view::TConstPtr"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer4viewE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::view::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4viewE9SharedPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::view::size"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer4viewE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::view::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4viewE9SharedPtr", "tensorrt_llm::runtime::IBuffer::view::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4viewE9SharedPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::view::tensor"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrRNSt6vectorI1TEE", "tensorrt_llm::runtime::IBuffer::wrap"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tE", 
"tensorrt_llm::runtime::IBuffer::wrap::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrRNSt6vectorI1TEE", "tensorrt_llm::runtime::IBuffer::wrap::T"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::capacity"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::capacity"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::data"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::data"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::data"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::data"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::size"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::type"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::type"], [1, 4, 1, 
"_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrRNSt6vectorI1TEE", "tensorrt_llm::runtime::IBuffer::wrap::v"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBufferD0Ev", "tensorrt_llm::runtime::IBuffer::~IBuffer"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoderE", "tensorrt_llm::runtime::IGptDecoder"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder14TensorConstPtrE", "tensorrt_llm::runtime::IGptDecoder::TensorConstPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder9TensorPtrE", "tensorrt_llm::runtime::IGptDecoder::TensorPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create::dtype"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create::maxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create::maxSequenceLength"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create::mode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create::speculativeDecodingModule"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create::stream"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create::vocabSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create::vocabSizePadded"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", "tensorrt_llm::runtime::IGptDecoder::disableLookahead"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", "tensorrt_llm::runtime::IGptDecoder::disableLookahead::batchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", 
"tensorrt_llm::runtime::IGptDecoder::disableLookahead::batchSlots"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", "tensorrt_llm::runtime::IGptDecoder::disableLookahead::samplingConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder12forwardAsyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::IGptDecoder::forwardAsync"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder12forwardAsyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::IGptDecoder::forwardAsync::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder12forwardAsyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::IGptDecoder::forwardAsync::output"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder11forwardSyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::IGptDecoder::forwardSync"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder11forwardSyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::IGptDecoder::forwardSync::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder11forwardSyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::IGptDecoder::forwardSync::output"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder17getSamplingConfigEv", "tensorrt_llm::runtime::IGptDecoder::getSamplingConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::IGptDecoder::setup"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::IGptDecoder::setup::batchSize"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime11IGptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::IGptDecoder::setup::batchSlots"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::IGptDecoder::setup::output"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::IGptDecoder::setup::requests"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::IGptDecoder::setup::samplingConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoderD0Ev", "tensorrt_llm::runtime::IGptDecoder::~IGptDecoder"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatchedE", "tensorrt_llm::runtime::IGptDecoderBatched"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched13CudaStreamPtrE", "tensorrt_llm::runtime::IGptDecoderBatched::CudaStreamPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched18IGptDecoderBatchedEv", "tensorrt_llm::runtime::IGptDecoderBatched::IGptDecoderBatched"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched13LlmRequestPtrE", "tensorrt_llm::runtime::IGptDecoderBatched::LlmRequestPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched13RequestVectorE", "tensorrt_llm::runtime::IGptDecoderBatched::RequestVector"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched9TensorPtrE", "tensorrt_llm::runtime::IGptDecoderBatched::TensorPtr"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched16disableLookaheadERK13RequestVectorRK9TensorPtr", "tensorrt_llm::runtime::IGptDecoderBatched::disableLookahead"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched16disableLookaheadERK13RequestVectorRK9TensorPtr", "tensorrt_llm::runtime::IGptDecoderBatched::disableLookahead::batchSlots"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched16disableLookaheadERK13RequestVectorRK9TensorPtr", "tensorrt_llm::runtime::IGptDecoderBatched::disableLookahead::genRequests"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime18IGptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::IGptDecoderBatched::finalize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime18IGptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::IGptDecoderBatched::finalize::batchSlot"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime18IGptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::IGptDecoderBatched::finalize::decoderState"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime18IGptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::IGptDecoderBatched::finalize::samplingConfig"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime18IGptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::IGptDecoderBatched::finalize::streaming"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::IGptDecoderBatched::forward"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::IGptDecoderBatched::forward::input"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::IGptDecoderBatched::forward::output"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::IGptDecoderBatched::forwardAsync"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::IGptDecoderBatched::forwardAsync::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::IGptDecoderBatched::forwardAsync::output"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::dtype"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::maxAttentionWindow"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::maxBatchSize"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::maxSequenceLength"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::maxTokensPerStep"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::mode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::sinkTokenLength"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::worldConfig"], [1, 
3, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatchedD0Ev", "tensorrt_llm::runtime::IGptDecoderBatched::~IGptDecoderBatched"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime7ITensorE", "tensorrt_llm::runtime::ITensor"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9DimType64E", "tensorrt_llm::runtime::ITensor::DimType64"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7ITensorERK7ITensor", "tensorrt_llm::runtime::ITensor::ITensor"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7ITensorEv", "tensorrt_llm::runtime::ITensor::ITensor"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5ShapeE", "tensorrt_llm::runtime::ITensor::Shape"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor14SharedConstPtrE", "tensorrt_llm::runtime::ITensor::SharedConstPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9SharedPtrE", "tensorrt_llm::runtime::ITensor::SharedPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9TensorMapE", "tensorrt_llm::runtime::ITensor::TensorMap"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor14UniqueConstPtrE", "tensorrt_llm::runtime::ITensor::UniqueConstPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9UniquePtrE", "tensorrt_llm::runtime::ITensor::UniquePtr"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::at"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atEN7ITensor14UniqueConstPtrERR9TConstPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::at"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor2atE9SharedPtrRK5Shape", "tensorrt_llm::runtime::ITensor::at"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor2atE9SharedPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::at"], [1, 8, 1, 
"_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::at::TConstPtr"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atEN7ITensor14UniqueConstPtrERR9TConstPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::at::TConstPtr"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::at::offsetDims"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atEN7ITensor14UniqueConstPtrERR9TConstPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::at::offsetDims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor2atE9SharedPtrRK5Shape", "tensorrt_llm::runtime::ITensor::at::offsetDims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor2atE9SharedPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::at::offsetDims"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::at::tensor"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atEN7ITensor14UniqueConstPtrERR9TConstPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::at::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor2atE9SharedPtrRK5Shape", "tensorrt_llm::runtime::ITensor::at::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor2atE9SharedPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::at::tensor"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime7ITensor8castSizeE6size_t", "tensorrt_llm::runtime::ITensor::castSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor8castSizeE6size_t", "tensorrt_llm::runtime::ITensor::castSize::newSize"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor8flattenNE9SharedPtrNSt7int64_tE", "tensorrt_llm::runtime::ITensor::flattenN"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor8flattenNE9SharedPtrNSt7int64_tE", "tensorrt_llm::runtime::ITensor::flattenN::sliceN"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor8flattenNE9SharedPtrNSt7int64_tE", "tensorrt_llm::runtime::ITensor::flattenN::tensor"], [1, 3, 1, "_CPPv4I_10SizeType32ENK12tensorrt_llm7runtime7ITensor12getDimensionE9DimType64v", "tensorrt_llm::runtime::ITensor::getDimension"], [1, 8, 1, "_CPPv4I_10SizeType32ENK12tensorrt_llm7runtime7ITensor12getDimensionE9DimType64v", "tensorrt_llm::runtime::ITensor::getDimension::n"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7ITensor8getShapeEv", "tensorrt_llm::runtime::ITensor::getShape"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9makeShapeERKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::makeShape"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9makeShapeERKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::makeShape::dims"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensoraSERK7ITensor", "tensorrt_llm::runtime::ITensor::operator="], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7reshapeERK5Shape", "tensorrt_llm::runtime::ITensor::reshape"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7reshapeERK5Shape", "tensorrt_llm::runtime::ITensor::reshape::dims"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor6resizeENSt6size_tE", "tensorrt_llm::runtime::ITensor::resize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor6resizeENSt6size_tE", "tensorrt_llm::runtime::ITensor::resize::newSize"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor11shapeEqualsEbRK5ShapePK1T10SizeType32", 
"tensorrt_llm::runtime::ITensor::shapeEquals"], [1, 3, 1, "_CPPv4I0ENK12tensorrt_llm7runtime7ITensor11shapeEqualsEbPK1T10SizeType32", "tensorrt_llm::runtime::ITensor::shapeEquals"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor11shapeEqualsERK5ShapeRK5Shape", "tensorrt_llm::runtime::ITensor::shapeEquals"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7ITensor11shapeEqualsERK5Shape", "tensorrt_llm::runtime::ITensor::shapeEquals"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7ITensor11shapeEqualsERKNSt16initializer_listI10SizeType32EE", "tensorrt_llm::runtime::ITensor::shapeEquals"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor11shapeEqualsEbRK5ShapePK1T10SizeType32", "tensorrt_llm::runtime::ITensor::shapeEquals::T"], [1, 8, 1, "_CPPv4I0ENK12tensorrt_llm7runtime7ITensor11shapeEqualsEbPK1T10SizeType32", "tensorrt_llm::runtime::ITensor::shapeEquals::T"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor11shapeEqualsEbRK5ShapePK1T10SizeType32", "tensorrt_llm::runtime::ITensor::shapeEquals::count"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime7ITensor11shapeEqualsEbPK1T10SizeType32", "tensorrt_llm::runtime::ITensor::shapeEquals::count"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor11shapeEqualsEbRK5ShapePK1T10SizeType32", "tensorrt_llm::runtime::ITensor::shapeEquals::dims"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime7ITensor11shapeEqualsEbPK1T10SizeType32", "tensorrt_llm::runtime::ITensor::shapeEquals::dims"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor11shapeEqualsEbRK5ShapePK1T10SizeType32", "tensorrt_llm::runtime::ITensor::shapeEquals::lhs"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor11shapeEqualsERK5ShapeRK5Shape", "tensorrt_llm::runtime::ITensor::shapeEquals::lhs"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7ITensor11shapeEqualsERK5Shape", "tensorrt_llm::runtime::ITensor::shapeEquals::other"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7ITensor11shapeEqualsERKNSt16initializer_listI10SizeType32EE", 
"tensorrt_llm::runtime::ITensor::shapeEquals::other"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor11shapeEqualsERK5ShapeRK5Shape", "tensorrt_llm::runtime::ITensor::shapeEquals::rhs"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRK5Shape", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRK5Shape9DimType64", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRKNSt16initializer_listI9DimType64EE9DimType64", "tensorrt_llm::runtime::ITensor::slice"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::TConstPtr"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::TConstPtr"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::slice::TConstPtr"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::TConstPtr"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::slice::TConstPtr"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::TConstPtr"], [1, 4, 1, 
"_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::offset"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::offset"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::offset"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::offset"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::slice::offsetDims"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::offsetDims"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::slice::offsetDims"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::offsetDims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRK5Shape", "tensorrt_llm::runtime::ITensor::slice::offsetDims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRK5Shape9DimType64", "tensorrt_llm::runtime::ITensor::slice::offsetDims"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::slice::offsetDims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRKNSt16initializer_listI9DimType64EE9DimType64", "tensorrt_llm::runtime::ITensor::slice::offsetDims"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::size"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::size"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRK5Shape9DimType64", "tensorrt_llm::runtime::ITensor::slice::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRKNSt16initializer_listI9DimType64EE9DimType64", "tensorrt_llm::runtime::ITensor::slice::size"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, 
"_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRK5Shape", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRK5Shape9DimType64", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRKNSt16initializer_listI9DimType64EE9DimType64", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7squeezeE10SizeType32", "tensorrt_llm::runtime::ITensor::squeeze"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7squeezeERK5Shape10SizeType32", 
"tensorrt_llm::runtime::ITensor::squeeze"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7squeezeE10SizeType32", "tensorrt_llm::runtime::ITensor::squeeze::dim"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7squeezeERK5Shape10SizeType32", "tensorrt_llm::runtime::ITensor::squeeze::dim"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7squeezeERK5Shape10SizeType32", "tensorrt_llm::runtime::ITensor::squeeze::shape"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7stridesERK5Shape", "tensorrt_llm::runtime::ITensor::strides"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7stridesERK5Shape", "tensorrt_llm::runtime::ITensor::strides::dims"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor8toStringERK5Shape", "tensorrt_llm::runtime::ITensor::toString"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor8toStringERK5Shape", "tensorrt_llm::runtime::ITensor::toString::dims"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9unsqueezeE10SizeType32", "tensorrt_llm::runtime::ITensor::unsqueeze"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9unsqueezeERK5Shape10SizeType32", "tensorrt_llm::runtime::ITensor::unsqueeze"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9unsqueezeE10SizeType32", "tensorrt_llm::runtime::ITensor::unsqueeze::dim"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9unsqueezeERK5Shape10SizeType32", "tensorrt_llm::runtime::ITensor::unsqueeze::dim"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9unsqueezeERK5Shape10SizeType32", "tensorrt_llm::runtime::ITensor::unsqueeze::shape"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor4viewE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::view"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4viewE9SharedPtr", "tensorrt_llm::runtime::ITensor::view"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4viewEN7IBuffer9SharedPtrERK5Shape", "tensorrt_llm::runtime::ITensor::view"], [1, 8, 1, 
"_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor4viewE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::view::TConstPtr"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4viewEN7IBuffer9SharedPtrERK5Shape", "tensorrt_llm::runtime::ITensor::view::buffer"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor4viewE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::view::dims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4viewEN7IBuffer9SharedPtrERK5Shape", "tensorrt_llm::runtime::ITensor::view::dims"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor4viewE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::view::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4viewE9SharedPtr", "tensorrt_llm::runtime::ITensor::view::tensor"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor6volumeERK5Shape", "tensorrt_llm::runtime::ITensor::volume"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor6volumeERK5Shape", "tensorrt_llm::runtime::ITensor::volume::dims"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor17volumeNonNegativeERK5Shape", "tensorrt_llm::runtime::ITensor::volumeNonNegative"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor17volumeNonNegativeERK5Shape", "tensorrt_llm::runtime::ITensor::volumeNonNegative::shape"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5Shape", "tensorrt_llm::runtime::ITensor::wrap"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrRNSt6vectorI1TEERK5Shape", "tensorrt_llm::runtime::ITensor::wrap"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5Shape", 
"tensorrt_llm::runtime::ITensor::wrap"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5Shape", "tensorrt_llm::runtime::ITensor::wrap::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrRNSt6vectorI1TEERK5Shape", "tensorrt_llm::runtime::ITensor::wrap::T"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap::capacity"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap::capacity"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5Shape", "tensorrt_llm::runtime::ITensor::wrap::data"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap::data"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5Shape", "tensorrt_llm::runtime::ITensor::wrap::data"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap::data"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5Shape", "tensorrt_llm::runtime::ITensor::wrap::shape"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap::shape"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrRNSt6vectorI1TEERK5Shape", "tensorrt_llm::runtime::ITensor::wrap::shape"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5Shape", "tensorrt_llm::runtime::ITensor::wrap::shape"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap::shape"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5Shape", "tensorrt_llm::runtime::ITensor::wrap::type"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap::type"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrRNSt6vectorI1TEERK5Shape", "tensorrt_llm::runtime::ITensor::wrap::v"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensorD0Ev", "tensorrt_llm::runtime::ITensor::~ITensor"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemoryE", "tensorrt_llm::runtime::IpcMemory"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9BufferPtrE", "tensorrt_llm::runtime::IpcMemory::BufferPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory10FLAGS_SIZEE", "tensorrt_llm::runtime::IpcMemory::FLAGS_SIZE"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfigb", "tensorrt_llm::runtime::IpcMemory::IpcMemory"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryERK9IpcMemory", "tensorrt_llm::runtime::IpcMemory::IpcMemory"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryERR9IpcMemory", "tensorrt_llm::runtime::IpcMemory::IpcMemory"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfigb", "tensorrt_llm::runtime::IpcMemory::IpcMemory::bufferSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfigb", "tensorrt_llm::runtime::IpcMemory::IpcMemory::manager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfigb", "tensorrt_llm::runtime::IpcMemory::IpcMemory::openIpc"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfigb", 
"tensorrt_llm::runtime::IpcMemory::IpcMemory::worldConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory17allocateIpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfig", "tensorrt_llm::runtime::IpcMemory::allocateIpcMemory"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory17allocateIpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfig", "tensorrt_llm::runtime::IpcMemory::allocateIpcMemory::bufferSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory17allocateIpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfig", "tensorrt_llm::runtime::IpcMemory::allocateIpcMemory::manager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory17allocateIpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfig", "tensorrt_llm::runtime::IpcMemory::allocateIpcMemory::worldConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory16destroyIpcMemoryEv", "tensorrt_llm::runtime::IpcMemory::destroyIpcMemory"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9IpcMemory11getCommPtrsEv", "tensorrt_llm::runtime::IpcMemory::getCommPtrs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory7mBufferE", "tensorrt_llm::runtime::IpcMemory::mBuffer"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9mCommPtrsE", "tensorrt_llm::runtime::IpcMemory::mCommPtrs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory8mOpenIpcE", "tensorrt_llm::runtime::IpcMemory::mOpenIpc"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory7mTpRankE", "tensorrt_llm::runtime::IpcMemory::mTpRank"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemoryaSERK9IpcMemory", "tensorrt_llm::runtime::IpcMemory::operator="], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemoryaSERR9IpcMemory", "tensorrt_llm::runtime::IpcMemory::operator="], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemoryD0Ev", "tensorrt_llm::runtime::IpcMemory::~IpcMemory"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandleE", "tensorrt_llm::runtime::IpcNvlsHandle"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle14ipc_uc_handlesE", 
"tensorrt_llm::runtime::IpcNvlsHandle::ipc_uc_handles"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle11ipc_uc_ptrsE", "tensorrt_llm::runtime::IpcNvlsHandle::ipc_uc_ptrs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle10ipc_uc_vasE", "tensorrt_llm::runtime::IpcNvlsHandle::ipc_uc_vas"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle9mc_handleE", "tensorrt_llm::runtime::IpcNvlsHandle::mc_handle"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle6mc_ptrE", "tensorrt_llm::runtime::IpcNvlsHandle::mc_ptr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle5mc_vaE", "tensorrt_llm::runtime::IpcNvlsHandle::mc_va"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle4sizeE", "tensorrt_llm::runtime::IpcNvlsHandle::size"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle9uc_handleE", "tensorrt_llm::runtime::IpcNvlsHandle::uc_handle"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle6uc_ptrE", "tensorrt_llm::runtime::IpcNvlsHandle::uc_ptr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle5uc_vaE", "tensorrt_llm::runtime::IpcNvlsHandle::uc_va"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffersE", "tensorrt_llm::runtime::LookaheadDecodingBuffers"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers24LookaheadDecodingBuffersE10SizeType3210SizeType32RK13BufferManager", "tensorrt_llm::runtime::LookaheadDecodingBuffers::LookaheadDecodingBuffers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers24LookaheadDecodingBuffersE10SizeType3210SizeType32RK13BufferManager", "tensorrt_llm::runtime::LookaheadDecodingBuffers::LookaheadDecodingBuffers::bufferManager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers24LookaheadDecodingBuffersE10SizeType3210SizeType32RK13BufferManager", "tensorrt_llm::runtime::LookaheadDecodingBuffers::LookaheadDecodingBuffers::maxNumSequences"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers24LookaheadDecodingBuffersE10SizeType3210SizeType32RK13BufferManager", "tensorrt_llm::runtime::LookaheadDecodingBuffers::LookaheadDecodingBuffers::maxTokensPerStep"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers9TensorPtrE", "tensorrt_llm::runtime::LookaheadDecodingBuffers::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers17generationLengthsE", "tensorrt_llm::runtime::LookaheadDecodingBuffers::generationLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers11packedMasksE", "tensorrt_llm::runtime::LookaheadDecodingBuffers::packedMasks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers11positionIdsE", "tensorrt_llm::runtime::LookaheadDecodingBuffers::positionIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers15positionOffsetsE", "tensorrt_llm::runtime::LookaheadDecodingBuffers::positionOffsets"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModuleE", "tensorrt_llm::runtime::LookaheadModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule15LookaheadModuleE10SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadModule::LookaheadModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule15LookaheadModuleEv", "tensorrt_llm::runtime::LookaheadModule::LookaheadModule"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule15LookaheadModuleE10SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadModule::LookaheadModule::maxDecodingDraftTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule15LookaheadModuleE10SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadModule::LookaheadModule::maxDraftPathLen"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime15LookaheadModule18getExecutionConfigEv", "tensorrt_llm::runtime::LookaheadModule::getExecutionConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule16mExecutionConfigE", 
"tensorrt_llm::runtime::LookaheadModule::mExecutionConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule18setExecutionConfigERKN8executor23LookaheadDecodingConfigE", "tensorrt_llm::runtime::LookaheadModule::setExecutionConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule18setExecutionConfigERKN8executor23LookaheadDecodingConfigE", "tensorrt_llm::runtime::LookaheadModule::setExecutionConfig::config"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffersE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23LookaheadRuntimeBuffersE10SizeType3210SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfigRKN8executor14DecodingConfigERK11TllmRuntime", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::LookaheadRuntimeBuffers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23LookaheadRuntimeBuffersE10SizeType3210SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfigRKN8executor14DecodingConfigERK11TllmRuntime", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::LookaheadRuntimeBuffers::decodingConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23LookaheadRuntimeBuffersE10SizeType3210SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfigRKN8executor14DecodingConfigERK11TllmRuntime", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::LookaheadRuntimeBuffers::manager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23LookaheadRuntimeBuffersE10SizeType3210SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfigRKN8executor14DecodingConfigERK11TllmRuntime", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::LookaheadRuntimeBuffers::maxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23LookaheadRuntimeBuffersE10SizeType3210SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfigRKN8executor14DecodingConfigERK11TllmRuntime", 
"tensorrt_llm::runtime::LookaheadRuntimeBuffers::LookaheadRuntimeBuffers::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23LookaheadRuntimeBuffersE10SizeType3210SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfigRKN8executor14DecodingConfigERK11TllmRuntime", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::LookaheadRuntimeBuffers::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23LookaheadRuntimeBuffersE10SizeType3210SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfigRKN8executor14DecodingConfigERK11TllmRuntime", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::LookaheadRuntimeBuffers::runtime"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23LookaheadRuntimeBuffersE10SizeType3210SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfigRKN8executor14DecodingConfigERK11TllmRuntime", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::LookaheadRuntimeBuffers::worldConfig"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers9TensorMapE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::TensorMap"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers9TensorPtrE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers18batchSlotsHostCopyE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::batchSlotsHostCopy"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers12cumSumLengthE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::cumSumLength"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers24disableLookaheadDecodingEv", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::disableLookaheadDecoding"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23enableLookaheadDecodingE10SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::enableLookaheadDecoding"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23enableLookaheadDecodingE10SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::enableLookaheadDecoding::maxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23enableLookaheadDecodingE10SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::enableLookaheadDecoding::tokensPerStep"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23generationLengthsDeviceE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::generationLengthsDevice"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers21generationLengthsHostE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::generationLengthsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers25generationLengthsHostCopyE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::generationLengthsHostCopy"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers18insertInputTensorsER9TensorMapR9TensorMapRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::insertInputTensors"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers18insertInputTensorsER9TensorMapR9TensorMapRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::insertInputTensors::inputBuffers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers18insertInputTensorsER9TensorMapR9TensorMapRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::insertInputTensors::outputBuffers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers18insertInputTensorsER9TensorMapR9TensorMapRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::insertInputTensors::worldConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers14packedMaskHostE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::packedMaskHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers18packedMaskHostCopyE", 
"tensorrt_llm::runtime::LookaheadRuntimeBuffers::packedMaskHostCopy"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers17packedMasksDeviceE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::packedMasksDevice"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers17positionIdsDeviceE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::positionIdsDevice"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers15positionIdsHostE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::positionIdsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers19positionIdsHostCopyE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::positionIdsHostCopy"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers21positionOffsetsDeviceE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::positionOffsetsDevice"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers19positionOffsetsHostE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::positionOffsetsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23positionOffsetsHostCopyE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::positionOffsetsHostCopy"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers7reshapeE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::reshape"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers7reshapeE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::reshape::numCtxSequences"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers7reshapeE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::reshape::numGenSequences"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers7reshapeE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::reshape::tokensPerStep"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::setFromInputs"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::setFromInputs::decoderLookaheadBuffers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::setFromInputs::modelConfig"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::setFromInputs::numCtxSequences"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::setFromInputs::numGenSequences"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::setFromInputs::requestTypes"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::setFromInputs::runtime"], [1, 4, 1, 
"_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::setFromInputs::seqSlots"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::setFromInputs::worldConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers15useSpecDecodingE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::useSpecDecoding"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCacheE", "tensorrt_llm::runtime::LoraCache"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9LoraCacheERK26LoraCachePageManagerConfigRK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::LoraCache::LoraCache"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9LoraCacheERK26LoraCachePageManagerConfigRK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::LoraCache::LoraCache::bufferManager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9LoraCacheERK26LoraCachePageManagerConfigRK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::LoraCache::LoraCache::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9LoraCacheERK26LoraCachePageManagerConfigRK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::LoraCache::LoraCache::pageManagerConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9LoraCacheERK26LoraCachePageManagerConfigRK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::LoraCache::LoraCache::worldConfig"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache10TaskIdTypeE", "tensorrt_llm::runtime::LoraCache::TaskIdType"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfigE", 
"tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig11adapterSizeE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::adapterSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig6inSizeE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::inSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig7layerIdE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::layerId"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig8moduleIdE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::moduleId"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig8numSlotsE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::numSlots"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfigeqERKN9LoraCache21TaskLayerModuleConfigE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::operator=="], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfigeqERKN9LoraCache21TaskLayerModuleConfigE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::operator==::o"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig7outSizeE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::outSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig6pageIdE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::pageId"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig17scalingVecPointerE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::scalingVecPointer"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig7slotIdxE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::slotIdx"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig8toStringEv", 
"tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::toString"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig16weightsInPointerE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::weightsInPointer"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig17weightsOutPointerE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::weightsOutPointer"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache28TaskLayerModuleConfigListPtrE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfigListPtr"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValueE", "tensorrt_llm::runtime::LoraCache::TaskValue"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERKNSt6vectorINSt6size_tEEERK28TaskLayerModuleConfigListPtrNSt4listI10TaskIdTypeE8iteratorEbbbb", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERR9TaskValue", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueEv", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERKNSt6vectorINSt6size_tEEERK28TaskLayerModuleConfigListPtrNSt4listI10TaskIdTypeE8iteratorEbbbb", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue::configs"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERKNSt6vectorINSt6size_tEEERK28TaskLayerModuleConfigListPtrNSt4listI10TaskIdTypeE8iteratorEbbbb", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue::done"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERKNSt6vectorINSt6size_tEEERK28TaskLayerModuleConfigListPtrNSt4listI10TaskIdTypeE8iteratorEbbbb", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue::inProgress"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERKNSt6vectorINSt6size_tEEERK28TaskLayerModuleConfigListPtrNSt4listI10TaskIdTypeE8iteratorEbbbb", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue::it"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERKNSt6vectorINSt6size_tEEERK28TaskLayerModuleConfigListPtrNSt4listI10TaskIdTypeE8iteratorEbbbb", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue::loadInProgress"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERKNSt6vectorINSt6size_tEEERK28TaskLayerModuleConfigListPtrNSt4listI10TaskIdTypeE8iteratorEbbbb", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue::loaded"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERR9TaskValue", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue::o"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERKNSt6vectorINSt6size_tEEERK28TaskLayerModuleConfigListPtrNSt4listI10TaskIdTypeE8iteratorEbbbb", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue::pageIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue7configsE", "tensorrt_llm::runtime::LoraCache::TaskValue::configs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue4doneE", "tensorrt_llm::runtime::LoraCache::TaskValue::done"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue10inProgressE", "tensorrt_llm::runtime::LoraCache::TaskValue::inProgress"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue2itE", "tensorrt_llm::runtime::LoraCache::TaskValue::it"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue14loadInProgressE", "tensorrt_llm::runtime::LoraCache::TaskValue::loadInProgress"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue6loadedE", "tensorrt_llm::runtime::LoraCache::TaskValue::loaded"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValueaSERR9TaskValue", "tensorrt_llm::runtime::LoraCache::TaskValue::operator="], 
[1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValueaSERR9TaskValue", "tensorrt_llm::runtime::LoraCache::TaskValue::operator=::o"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue7pageIdsE", "tensorrt_llm::runtime::LoraCache::TaskValue::pageIds"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValueD0Ev", "tensorrt_llm::runtime::LoraCache::TaskValue::~TaskValue"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache12TaskValuePtrE", "tensorrt_llm::runtime::LoraCache::TaskValuePtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TensorPtrE", "tensorrt_llm::runtime::LoraCache::TensorPtr"], [1, 6, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatusE", "tensorrt_llm::runtime::LoraCache::ValueStatus"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatus20kVALUE_STATUS_LOADEDE", "tensorrt_llm::runtime::LoraCache::ValueStatus::kVALUE_STATUS_LOADED"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatus21kVALUE_STATUS_MISSINGE", "tensorrt_llm::runtime::LoraCache::ValueStatus::kVALUE_STATUS_MISSING"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatus24kVALUE_STATUS_PROCESSINGE", "tensorrt_llm::runtime::LoraCache::ValueStatus::kVALUE_STATUS_PROCESSING"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache4bumpE10TaskIdType", "tensorrt_llm::runtime::LoraCache::bump"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache4bumpE10TaskIdType", "tensorrt_llm::runtime::LoraCache::bump::taskId"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache18bumpTaskInProgressE10TaskIdType", "tensorrt_llm::runtime::LoraCache::bumpTaskInProgress"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache18bumpTaskInProgressE10TaskIdType", "tensorrt_llm::runtime::LoraCache::bumpTaskInProgress::taskId"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache19claimPagesWithEvictE10SizeType32", "tensorrt_llm::runtime::LoraCache::claimPagesWithEvict"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache19claimPagesWithEvictE10SizeType32", 
"tensorrt_llm::runtime::LoraCache::claimPagesWithEvict::numPages"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache8copyTaskE10TaskIdTypeR9LoraCacheb", "tensorrt_llm::runtime::LoraCache::copyTask"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache8copyTaskE10TaskIdTypeR9LoraCacheb", "tensorrt_llm::runtime::LoraCache::copyTask::deviceCache"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache8copyTaskE10TaskIdTypeR9LoraCacheb", "tensorrt_llm::runtime::LoraCache::copyTask::markDone"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache8copyTaskE10TaskIdTypeR9LoraCacheb", "tensorrt_llm::runtime::LoraCache::copyTask::taskId"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache16copyTaskMapPagesER9TaskValueRK9TaskValueRKNSt6vectorI6size_tEERK9LoraCache", "tensorrt_llm::runtime::LoraCache::copyTaskMapPages"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache16copyTaskMapPagesER9TaskValueRK9TaskValueRKNSt6vectorI6size_tEERK9LoraCache", "tensorrt_llm::runtime::LoraCache::copyTaskMapPages::sourceTaskValue"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache16copyTaskMapPagesER9TaskValueRK9TaskValueRKNSt6vectorI6size_tEERK9LoraCache", "tensorrt_llm::runtime::LoraCache::copyTaskMapPages::targetCache"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache16copyTaskMapPagesER9TaskValueRK9TaskValueRKNSt6vectorI6size_tEERK9LoraCache", "tensorrt_llm::runtime::LoraCache::copyTaskMapPages::targetPageIds"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache16copyTaskMapPagesER9TaskValueRK9TaskValueRKNSt6vectorI6size_tEERK9LoraCache", "tensorrt_llm::runtime::LoraCache::copyTaskMapPages::targetTaskValue"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCache::copyToPages"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCache::copyToPages::config"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCache::copyToPages::manager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCache::copyToPages::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCache::copyToPages::moduleIdToModel"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCache::copyToPages::pageIds"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCache::copyToPages::pages"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", 
"tensorrt_llm::runtime::LoraCache::copyToPages::weights"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCache::copyToPages::worldConfig"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache17determineNumPagesE10TaskIdType", "tensorrt_llm::runtime::LoraCache::determineNumPages"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache17determineNumPagesE9TensorPtr", "tensorrt_llm::runtime::LoraCache::determineNumPages"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache17determineNumPagesE9TensorPtr", "tensorrt_llm::runtime::LoraCache::determineNumPages::config"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache17determineNumPagesE10TaskIdType", "tensorrt_llm::runtime::LoraCache::determineNumPages::taskId"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache4fitsE9TensorPtr", "tensorrt_llm::runtime::LoraCache::fits"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache4fitsE9TensorPtr", "tensorrt_llm::runtime::LoraCache::fits::config"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache3getE10TaskIdType", "tensorrt_llm::runtime::LoraCache::get"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache3getE10TaskIdType", "tensorrt_llm::runtime::LoraCache::get::taskId"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache11getNumPagesEv", "tensorrt_llm::runtime::LoraCache::getNumPages"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache10getPagePtrE6size_t", "tensorrt_llm::runtime::LoraCache::getPagePtr"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache10getPagePtrE6size_t", "tensorrt_llm::runtime::LoraCache::getPagePtr::pageId"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache9getStatusE10TaskIdType", "tensorrt_llm::runtime::LoraCache::getStatus"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache9getStatusE10TaskIdType", 
"tensorrt_llm::runtime::LoraCache::getStatus::taskId"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache3hasE10TaskIdType", "tensorrt_llm::runtime::LoraCache::has"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache3hasE10TaskIdType", "tensorrt_llm::runtime::LoraCache::has::taskId"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache6isDoneE10TaskIdType", "tensorrt_llm::runtime::LoraCache::isDone"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache6isDoneE10TaskIdType", "tensorrt_llm::runtime::LoraCache::isDone::taskId"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache8isLoadedE10TaskIdType", "tensorrt_llm::runtime::LoraCache::isLoaded"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache8isLoadedE10TaskIdType", "tensorrt_llm::runtime::LoraCache::isLoaded::taskId"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatus20kVALUE_STATUS_LOADEDE", "tensorrt_llm::runtime::LoraCache::kVALUE_STATUS_LOADED"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatus21kVALUE_STATUS_MISSINGE", "tensorrt_llm::runtime::LoraCache::kVALUE_STATUS_MISSING"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatus24kVALUE_STATUS_PROCESSINGE", "tensorrt_llm::runtime::LoraCache::kVALUE_STATUS_PROCESSING"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsE10TaskIdType9TensorPtr9TensorPtr", "tensorrt_llm::runtime::LoraCache::loadWeights"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsER9TaskValue9TensorPtr9TensorPtr", "tensorrt_llm::runtime::LoraCache::loadWeights"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsER9TaskValue9TensorPtr9TensorPtr", "tensorrt_llm::runtime::LoraCache::loadWeights::cacheValue"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsE10TaskIdType9TensorPtr9TensorPtr", "tensorrt_llm::runtime::LoraCache::loadWeights::config"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsER9TaskValue9TensorPtr9TensorPtr", 
"tensorrt_llm::runtime::LoraCache::loadWeights::config"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsE10TaskIdType9TensorPtr9TensorPtr", "tensorrt_llm::runtime::LoraCache::loadWeights::taskId"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsE10TaskIdType9TensorPtr9TensorPtr", "tensorrt_llm::runtime::LoraCache::loadWeights::weights"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsER9TaskValue9TensorPtr9TensorPtr", "tensorrt_llm::runtime::LoraCache::loadWeights::weights"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache14mBufferManagerE", "tensorrt_llm::runtime::LoraCache::mBufferManager"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9mCacheMapE", "tensorrt_llm::runtime::LoraCache::mCacheMap"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11mCacheMutexE", "tensorrt_llm::runtime::LoraCache::mCacheMutex"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache17mCachePageManagerE", "tensorrt_llm::runtime::LoraCache::mCachePageManager"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21mDeviceBufferManagersE", "tensorrt_llm::runtime::LoraCache::mDeviceBufferManagers"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache10mDoneTasksE", "tensorrt_llm::runtime::LoraCache::mDoneTasks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache16mInProgressTasksE", "tensorrt_llm::runtime::LoraCache::mInProgressTasks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache12mModelConfigE", "tensorrt_llm::runtime::LoraCache::mModelConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache17mModuleIdToModuleE", "tensorrt_llm::runtime::LoraCache::mModuleIdToModule"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache18mPageManagerConfigE", "tensorrt_llm::runtime::LoraCache::mPageManagerConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11mPagesMutexE", "tensorrt_llm::runtime::LoraCache::mPagesMutex"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache12mWorldConfigE", 
"tensorrt_llm::runtime::LoraCache::mWorldConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11markAllDoneEv", "tensorrt_llm::runtime::LoraCache::markAllDone"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache12markTaskDoneE10TaskIdType", "tensorrt_llm::runtime::LoraCache::markTaskDone"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache12markTaskDoneE10TaskIdType", "tensorrt_llm::runtime::LoraCache::markTaskDone::taskId"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache3putE10TaskIdType9TensorPtr9TensorPtrb", "tensorrt_llm::runtime::LoraCache::put"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache3putE10TaskIdType9TensorPtr9TensorPtrb", "tensorrt_llm::runtime::LoraCache::put::config"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache3putE10TaskIdType9TensorPtr9TensorPtrb", "tensorrt_llm::runtime::LoraCache::put::load"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache3putE10TaskIdType9TensorPtr9TensorPtrb", "tensorrt_llm::runtime::LoraCache::put::taskId"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache3putE10TaskIdType9TensorPtr9TensorPtrb", "tensorrt_llm::runtime::LoraCache::put::weights"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache17splitTransposeCpuER7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpu"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache17splitTransposeCpuER7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpu::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache17splitTransposeCpuER7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpu::output"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache17splitTransposeCpuER7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpu::tpRank"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache17splitTransposeCpuER7ITensorRK7ITensor10SizeType3210SizeType32", 
"tensorrt_llm::runtime::LoraCache::splitTransposeCpu::tpSize"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime9LoraCache22splitTransposeCpuInnerEvR7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpuInner"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime9LoraCache22splitTransposeCpuInnerEvR7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpuInner::T"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime9LoraCache22splitTransposeCpuInnerEvR7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpuInner::input"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime9LoraCache22splitTransposeCpuInnerEvR7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpuInner::output"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime9LoraCache22splitTransposeCpuInnerEvR7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpuInner::tpRank"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime9LoraCache22splitTransposeCpuInnerEvR7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpuInner::tpSize"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime22LoraCacheFullExceptionE", "tensorrt_llm::runtime::LoraCacheFullException"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime22LoraCacheFullException22LoraCacheFullExceptionERKNSt6stringE", "tensorrt_llm::runtime::LoraCacheFullException::LoraCacheFullException"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime22LoraCacheFullException22LoraCacheFullExceptionERKNSt6stringE", "tensorrt_llm::runtime::LoraCacheFullException::LoraCacheFullException::msg"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime22LoraCacheFullExceptionD0Ev", "tensorrt_llm::runtime::LoraCacheFullException::~LoraCacheFullException"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManagerE", "tensorrt_llm::runtime::LoraCachePageManager"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager20LoraCachePageManagerERK26LoraCachePageManagerConfigRK13BufferManager", "tensorrt_llm::runtime::LoraCachePageManager::LoraCachePageManager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager20LoraCachePageManagerERK26LoraCachePageManagerConfigRK13BufferManager", "tensorrt_llm::runtime::LoraCachePageManager::LoraCachePageManager::bufferManager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager20LoraCachePageManagerERK26LoraCachePageManagerConfigRK13BufferManager", "tensorrt_llm::runtime::LoraCachePageManager::LoraCachePageManager::config"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager9TensorPtrE", "tensorrt_llm::runtime::LoraCachePageManager::TensorPtr"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime20LoraCachePageManager8blockPtrE10SizeType32", "tensorrt_llm::runtime::LoraCachePageManager::blockPtr"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime20LoraCachePageManager8blockPtrE10SizeType32", "tensorrt_llm::runtime::LoraCachePageManager::blockPtr::blockIdx"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager10claimPagesE10SizeType32", "tensorrt_llm::runtime::LoraCachePageManager::claimPages"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager10claimPagesE10SizeType32", "tensorrt_llm::runtime::LoraCachePageManager::claimPages::numPages"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager10initializeERK13BufferManager", "tensorrt_llm::runtime::LoraCachePageManager::initialize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager10initializeERK13BufferManager", "tensorrt_llm::runtime::LoraCachePageManager::initialize::bufferManager"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager7mConfigE", "tensorrt_llm::runtime::LoraCachePageManager::mConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager12mFreePageIdsE", "tensorrt_llm::runtime::LoraCachePageManager::mFreePageIds"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager11mIsPageFreeE", "tensorrt_llm::runtime::LoraCachePageManager::mIsPageFree"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager11mPageBlocksE", "tensorrt_llm::runtime::LoraCachePageManager::mPageBlocks"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager14mutablePagePtrENSt6size_tE", "tensorrt_llm::runtime::LoraCachePageManager::mutablePagePtr"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager14mutablePagePtrENSt6size_tE", "tensorrt_llm::runtime::LoraCachePageManager::mutablePagePtr::pageIdx"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime20LoraCachePageManager17numAvailablePagesEv", "tensorrt_llm::runtime::LoraCachePageManager::numAvailablePages"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime20LoraCachePageManager7pagePtrENSt6size_tE", "tensorrt_llm::runtime::LoraCachePageManager::pagePtr"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime20LoraCachePageManager7pagePtrENSt6size_tE", "tensorrt_llm::runtime::LoraCachePageManager::pagePtr::pageIdx"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager12releasePagesERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCachePageManager::releasePages"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager12releasePagesERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCachePageManager::releasePages::pages"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfigE", "tensorrt_llm::runtime::LoraCachePageManagerConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig26LoraCachePageManagerConfigEN7runtime10MemoryTypeEN8nvinfer18DataTypeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::LoraCachePageManagerConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig26LoraCachePageManagerConfigEN7runtime10MemoryTypeEN8nvinfer18DataTypeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", 
"tensorrt_llm::runtime::LoraCachePageManagerConfig::LoraCachePageManagerConfig::dType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig26LoraCachePageManagerConfigEN7runtime10MemoryTypeEN8nvinfer18DataTypeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::LoraCachePageManagerConfig::maxPagesPerBlock"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig26LoraCachePageManagerConfigEN7runtime10MemoryTypeEN8nvinfer18DataTypeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::LoraCachePageManagerConfig::memType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig26LoraCachePageManagerConfigEN7runtime10MemoryTypeEN8nvinfer18DataTypeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::LoraCachePageManagerConfig::numCopyStreams"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig26LoraCachePageManagerConfigEN7runtime10MemoryTypeEN8nvinfer18DataTypeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::LoraCachePageManagerConfig::pageWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig26LoraCachePageManagerConfigEN7runtime10MemoryTypeEN8nvinfer18DataTypeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::LoraCachePageManagerConfig::slotsPerPage"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig26LoraCachePageManagerConfigEN7runtime10MemoryTypeEN8nvinfer18DataTypeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::LoraCachePageManagerConfig::totalNumPages"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig11getDataTypeEv", 
"tensorrt_llm::runtime::LoraCachePageManagerConfig::getDataType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig13getInitToZeroEv", "tensorrt_llm::runtime::LoraCachePageManagerConfig::getInitToZero"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig19getMaxPagesPerBlockEv", "tensorrt_llm::runtime::LoraCachePageManagerConfig::getMaxPagesPerBlock"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig13getMemoryTypeEv", "tensorrt_llm::runtime::LoraCachePageManagerConfig::getMemoryType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig17getNumCopyStreamsEv", "tensorrt_llm::runtime::LoraCachePageManagerConfig::getNumCopyStreams"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig12getPageWidthEv", "tensorrt_llm::runtime::LoraCachePageManagerConfig::getPageWidth"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig15getSlotsPerPageEv", "tensorrt_llm::runtime::LoraCachePageManagerConfig::getSlotsPerPage"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig16getTotalNumPagesEv", "tensorrt_llm::runtime::LoraCachePageManagerConfig::getTotalNumPages"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig9mDataTypeE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::mDataType"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig11mInitToZeroE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::mInitToZero"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig17mMaxPagesPerBlockE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::mMaxPagesPerBlock"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig11mMemoryTypeE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::mMemoryType"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig15mNumCopyStreamsE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::mNumCopyStreams"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig10mPageWidthE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::mPageWidth"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig13mSlotsPerPageE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::mSlotsPerPage"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig14mTotalNumPagesE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::mTotalNumPages"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig11setDataTypeERKN8nvinfer18DataTypeE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setDataType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig11setDataTypeERKN8nvinfer18DataTypeE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setDataType::dtype"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig13setInitToZeroEb", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setInitToZero"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig13setInitToZeroEb", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setInitToZero::initToZero"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig19setMaxPagesPerBlockERK10SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setMaxPagesPerBlock"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig19setMaxPagesPerBlockERK10SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setMaxPagesPerBlock::maxPagesPerBlock"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig13setMemoryTypeERKN7runtime10MemoryTypeE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setMemoryType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig13setMemoryTypeERKN7runtime10MemoryTypeE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setMemoryType::memoryType"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig17setNumCopyStreamsE10SizeType32", 
"tensorrt_llm::runtime::LoraCachePageManagerConfig::setNumCopyStreams"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig17setNumCopyStreamsE10SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setNumCopyStreams::numCopyStreams"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig12setPageWidthERK10SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setPageWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig12setPageWidthERK10SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setPageWidth::pageWidth"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig15setSlotsPerPageERK10SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setSlotsPerPage"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig15setSlotsPerPageERK10SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setSlotsPerPage::slotsPerPage"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig15setTotalNumPageERK10SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setTotalNumPage"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig15setTotalNumPageERK10SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setTotalNumPage::totalNumPages"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime21LoraExpectedExceptionE", "tensorrt_llm::runtime::LoraExpectedException"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime21LoraExpectedException21LoraExpectedExceptionERKNSt6stringE", "tensorrt_llm::runtime::LoraExpectedException::LoraExpectedException"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime21LoraExpectedException21LoraExpectedExceptionERKNSt6stringE", "tensorrt_llm::runtime::LoraExpectedException::LoraExpectedException::msg"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime21LoraExpectedExceptionD0Ev", "tensorrt_llm::runtime::LoraExpectedException::~LoraExpectedException"], [1, 2, 1, 
"_CPPv4N12tensorrt_llm7runtime10LoraModuleE", "tensorrt_llm::runtime::LoraModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10LoraModule", "tensorrt_llm::runtime::LoraModule::LoraModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10ModuleType10SizeType3210SizeType32bb10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::LoraModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleEv", "tensorrt_llm::runtime::LoraModule::LoraModule"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10ModuleType10SizeType3210SizeType32bb10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::LoraModule::inDim"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10ModuleType10SizeType3210SizeType32bb10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::LoraModule::inDimFirst"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10ModuleType10SizeType3210SizeType32bb10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::LoraModule::inTpSplitDim"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10LoraModule", "tensorrt_llm::runtime::LoraModule::LoraModule::o"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10ModuleType10SizeType3210SizeType32bb10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::LoraModule::outDim"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10ModuleType10SizeType3210SizeType32bb10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::LoraModule::outDimFirst"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10ModuleType10SizeType3210SizeType32bb10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::LoraModule::outTpSplitDim"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10ModuleType10SizeType3210SizeType32bb10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::LoraModule::t"], 
[1, 6, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleTypeE", "tensorrt_llm::runtime::LoraModule::ModuleType"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType11kATTN_DENSEE", "tensorrt_llm::runtime::LoraModule::ModuleType::kATTN_DENSE"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType7kATTN_KE", "tensorrt_llm::runtime::LoraModule::ModuleType::kATTN_K"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType7kATTN_QE", "tensorrt_llm::runtime::LoraModule::ModuleType::kATTN_Q"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType9kATTN_QKVE", "tensorrt_llm::runtime::LoraModule::ModuleType::kATTN_QKV"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType7kATTN_VE", "tensorrt_llm::runtime::LoraModule::ModuleType::kATTN_V"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType17kCROSS_ATTN_DENSEE", "tensorrt_llm::runtime::LoraModule::ModuleType::kCROSS_ATTN_DENSE"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType13kCROSS_ATTN_KE", "tensorrt_llm::runtime::LoraModule::ModuleType::kCROSS_ATTN_K"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType13kCROSS_ATTN_QE", "tensorrt_llm::runtime::LoraModule::ModuleType::kCROSS_ATTN_Q"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType15kCROSS_ATTN_QKVE", "tensorrt_llm::runtime::LoraModule::ModuleType::kCROSS_ATTN_QKV"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType13kCROSS_ATTN_VE", "tensorrt_llm::runtime::LoraModule::ModuleType::kCROSS_ATTN_V"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType8kINVALIDE", "tensorrt_llm::runtime::LoraModule::ModuleType::kINVALID"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMLP_4H_TO_HE", "tensorrt_llm::runtime::LoraModule::ModuleType::kMLP_4H_TO_H"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType9kMLP_GATEE", "tensorrt_llm::runtime::LoraModule::ModuleType::kMLP_GATE"], [1, 7, 1, 
"_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMLP_GATE_UPE", "tensorrt_llm::runtime::LoraModule::ModuleType::kMLP_GATE_UP"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMLP_H_TO_4HE", "tensorrt_llm::runtime::LoraModule::ModuleType::kMLP_H_TO_4H"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType11kMLP_ROUTERE", "tensorrt_llm::runtime::LoraModule::ModuleType::kMLP_ROUTER"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMOE_4H_TO_HE", "tensorrt_llm::runtime::LoraModule::ModuleType::kMOE_4H_TO_H"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType9kMOE_GATEE", "tensorrt_llm::runtime::LoraModule::ModuleType::kMOE_GATE"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMOE_H_TO_4HE", "tensorrt_llm::runtime::LoraModule::ModuleType::kMOE_H_TO_4H"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType11kMOE_ROUTERE", "tensorrt_llm::runtime::LoraModule::ModuleType::kMOE_ROUTER"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule9TensorPtrE", "tensorrt_llm::runtime::LoraModule::TensorPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::createLoraModules"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::createLoraModules::attentionHeadSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::createLoraModules::hiddenSize"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::createLoraModules::loraModuleNames"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::createLoraModules::mlpHiddenSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::createLoraModules::numAttentionHeads"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::createLoraModules::numExperts"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::createLoraModules::numKvAttentionHeads"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::createLoraModules::tpSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule18flattenedInOutSizeE10SizeType32b", "tensorrt_llm::runtime::LoraModule::flattenedInOutSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule18flattenedInOutSizeE10SizeType32b", "tensorrt_llm::runtime::LoraModule::flattenedInOutSize::adapterSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule18flattenedInOutSizeE10SizeType32b", "tensorrt_llm::runtime::LoraModule::flattenedInOutSize::isDora"], 
[1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule5inDimEv", "tensorrt_llm::runtime::LoraModule::inDim"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule10inDimFirstEv", "tensorrt_llm::runtime::LoraModule::inDimFirst"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule6inSizeE10SizeType32", "tensorrt_llm::runtime::LoraModule::inSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule6inSizeE10SizeType32", "tensorrt_llm::runtime::LoraModule::inSize::adapterSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule12inTpSplitDimEv", "tensorrt_llm::runtime::LoraModule::inTpSplitDim"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule18localInAdapterSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localInAdapterSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule18localInAdapterSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localInAdapterSize::adapterSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule18localInAdapterSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localInAdapterSize::tpSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule10localInDimE10SizeType32", "tensorrt_llm::runtime::LoraModule::localInDim"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule10localInDimE10SizeType32", "tensorrt_llm::runtime::LoraModule::localInDim::tpSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule14localInOutSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localInOutSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule14localInOutSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localInOutSize::adapterSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule14localInOutSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localInOutSize::tpSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule11localInSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localInSize"], 
[1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule11localInSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localInSize::adapterSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule11localInSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localInSize::tpSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule19localOutAdapterSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localOutAdapterSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule19localOutAdapterSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localOutAdapterSize::adapterSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule19localOutAdapterSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localOutAdapterSize::tpSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule11localOutDimE10SizeType32", "tensorrt_llm::runtime::LoraModule::localOutDim"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule11localOutDimE10SizeType32", "tensorrt_llm::runtime::LoraModule::localOutDim::tpSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule12localOutSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localOutSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule12localOutSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localOutSize::adapterSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule12localOutSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localOutSize::tpSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule15localScalesSizeE10SizeType32b", "tensorrt_llm::runtime::LoraModule::localScalesSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule15localScalesSizeE10SizeType32b", "tensorrt_llm::runtime::LoraModule::localScalesSize::isDora"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule15localScalesSizeE10SizeType32b", "tensorrt_llm::runtime::LoraModule::localScalesSize::tpSize"], [1, 3, 
1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule14localTotalSizeE10SizeType3210SizeType32b", "tensorrt_llm::runtime::LoraModule::localTotalSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule14localTotalSizeE10SizeType3210SizeType32b", "tensorrt_llm::runtime::LoraModule::localTotalSize::adapterSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule14localTotalSizeE10SizeType3210SizeType32b", "tensorrt_llm::runtime::LoraModule::localTotalSize::isDora"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule14localTotalSizeE10SizeType3210SizeType32b", "tensorrt_llm::runtime::LoraModule::localTotalSize::tpSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule6mInDimE", "tensorrt_llm::runtime::LoraModule::mInDim"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule11mInDimFirstE", "tensorrt_llm::runtime::LoraModule::mInDimFirst"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule13mInTpSplitDimE", "tensorrt_llm::runtime::LoraModule::mInTpSplitDim"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule7mOutDimE", "tensorrt_llm::runtime::LoraModule::mOutDim"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12mOutDimFirstE", "tensorrt_llm::runtime::LoraModule::mOutDimFirst"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule14mOutTpSplitDimE", "tensorrt_llm::runtime::LoraModule::mOutTpSplitDim"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule5mTypeE", "tensorrt_llm::runtime::LoraModule::mType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule4nameEv", "tensorrt_llm::runtime::LoraModule::name"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModuleaSERK10LoraModule", "tensorrt_llm::runtime::LoraModule::operator="], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModuleaSERK10LoraModule", "tensorrt_llm::runtime::LoraModule::operator=::o"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule6outDimEv", "tensorrt_llm::runtime::LoraModule::outDim"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule11outDimFirstEv", 
"tensorrt_llm::runtime::LoraModule::outDimFirst"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule7outSizeE10SizeType32", "tensorrt_llm::runtime::LoraModule::outSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule7outSizeE10SizeType32", "tensorrt_llm::runtime::LoraModule::outSize::adapterSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule13outTpSplitDimEv", "tensorrt_llm::runtime::LoraModule::outTpSplitDim"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12toModuleNameE10ModuleType", "tensorrt_llm::runtime::LoraModule::toModuleName"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12toModuleNameE10SizeType32", "tensorrt_llm::runtime::LoraModule::toModuleName"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12toModuleNameE10SizeType32", "tensorrt_llm::runtime::LoraModule::toModuleName::id"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12toModuleNameE10ModuleType", "tensorrt_llm::runtime::LoraModule::toModuleName::t"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12toModuleTypeERKNSt11string_viewE", "tensorrt_llm::runtime::LoraModule::toModuleType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12toModuleTypeERKNSt11string_viewE", "tensorrt_llm::runtime::LoraModule::toModuleType::name"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule5valueEv", "tensorrt_llm::runtime::LoraModule::value"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14LoraTaskIdTypeE", "tensorrt_llm::runtime::LoraTaskIdType"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17MPI_group_barrierENSt3setIiEE", "tensorrt_llm::runtime::MPI_group_barrier"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17MPI_group_barrierENSt3setIiEE", "tensorrt_llm::runtime::MPI_group_barrier::ranks"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime12MedusaModuleE", "tensorrt_llm::runtime::MedusaModule"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule13MedusaChoicesE", "tensorrt_llm::runtime::MedusaModule::MedusaChoices"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime12MedusaModule12MedusaModuleE10SizeType3210SizeType32", "tensorrt_llm::runtime::MedusaModule::MedusaModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule12MedusaModuleEv", "tensorrt_llm::runtime::MedusaModule::MedusaModule"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule12MedusaModuleE10SizeType3210SizeType32", "tensorrt_llm::runtime::MedusaModule::MedusaModule::maxAcceptedTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule12MedusaModuleE10SizeType3210SizeType32", "tensorrt_llm::runtime::MedusaModule::MedusaModule::maxDraftTokens"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule9TensorPtrE", "tensorrt_llm::runtime::MedusaModule::TensorPtr"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime12MedusaModule16getMedusaChoicesEv", "tensorrt_llm::runtime::MedusaModule::getMedusaChoices"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule21mDefaultMedusaChoicesE", "tensorrt_llm::runtime::MedusaModule::mDefaultMedusaChoices"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCountersE", "tensorrt_llm::runtime::MemoryCounters"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8DiffTypeE", "tensorrt_llm::runtime::MemoryCounters::DiffType"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters14MemoryCountersEv", "tensorrt_llm::runtime::MemoryCounters::MemoryCounters"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters10SizeType32E", "tensorrt_llm::runtime::MemoryCounters::SizeType32"], [1, 3, 1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime14MemoryCounters8allocateEv10SizeType32", "tensorrt_llm::runtime::MemoryCounters::allocate"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8allocateE10MemoryType10SizeType32", "tensorrt_llm::runtime::MemoryCounters::allocate"], [1, 8, 1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime14MemoryCounters8allocateEv10SizeType32", "tensorrt_llm::runtime::MemoryCounters::allocate::T"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime14MemoryCounters8allocateE10MemoryType10SizeType32", "tensorrt_llm::runtime::MemoryCounters::allocate::memoryType"], [1, 4, 1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime14MemoryCounters8allocateEv10SizeType32", "tensorrt_llm::runtime::MemoryCounters::allocate::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8allocateE10MemoryType10SizeType32", "tensorrt_llm::runtime::MemoryCounters::allocate::size"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters13bytesToStringE10SizeType32i", "tensorrt_llm::runtime::MemoryCounters::bytesToString"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters13bytesToStringE8DiffTypei", "tensorrt_llm::runtime::MemoryCounters::bytesToString"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters13bytesToStringE10SizeType32i", "tensorrt_llm::runtime::MemoryCounters::bytesToString::bytes"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters13bytesToStringE8DiffTypei", "tensorrt_llm::runtime::MemoryCounters::bytesToString::bytes"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters13bytesToStringE10SizeType32i", "tensorrt_llm::runtime::MemoryCounters::bytesToString::precision"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters13bytesToStringE8DiffTypei", "tensorrt_llm::runtime::MemoryCounters::bytesToString::precision"], [1, 3, 1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime14MemoryCounters10deallocateEv10SizeType32", "tensorrt_llm::runtime::MemoryCounters::deallocate"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters10deallocateE10MemoryType10SizeType32", "tensorrt_llm::runtime::MemoryCounters::deallocate"], [1, 8, 1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime14MemoryCounters10deallocateEv10SizeType32", "tensorrt_llm::runtime::MemoryCounters::deallocate::T"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters10deallocateE10MemoryType10SizeType32", "tensorrt_llm::runtime::MemoryCounters::deallocate::memoryType"], [1, 4, 1, 
"_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime14MemoryCounters10deallocateEv10SizeType32", "tensorrt_llm::runtime::MemoryCounters::deallocate::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters10deallocateE10MemoryType10SizeType32", "tensorrt_llm::runtime::MemoryCounters::deallocate::size"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters6getCpuEv", "tensorrt_llm::runtime::MemoryCounters::getCpu"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters10getCpuDiffEv", "tensorrt_llm::runtime::MemoryCounters::getCpuDiff"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters6getGpuEv", "tensorrt_llm::runtime::MemoryCounters::getGpu"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters10getGpuDiffEv", "tensorrt_llm::runtime::MemoryCounters::getGpuDiff"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters11getInstanceEv", "tensorrt_llm::runtime::MemoryCounters::getInstance"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters9getPinnedEv", "tensorrt_llm::runtime::MemoryCounters::getPinned"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters13getPinnedDiffEv", "tensorrt_llm::runtime::MemoryCounters::getPinnedDiff"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters13getPinnedPoolEv", "tensorrt_llm::runtime::MemoryCounters::getPinnedPool"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters17getPinnedPoolDiffEv", "tensorrt_llm::runtime::MemoryCounters::getPinnedPoolDiff"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters6getUVMEv", "tensorrt_llm::runtime::MemoryCounters::getUVM"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters10getUVMDiffEv", "tensorrt_llm::runtime::MemoryCounters::getUVMDiff"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters4mCpuE", "tensorrt_llm::runtime::MemoryCounters::mCpu"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8mCpuDiffE", "tensorrt_llm::runtime::MemoryCounters::mCpuDiff"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime14MemoryCounters4mGpuE", "tensorrt_llm::runtime::MemoryCounters::mGpu"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8mGpuDiffE", "tensorrt_llm::runtime::MemoryCounters::mGpuDiff"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters7mPinnedE", "tensorrt_llm::runtime::MemoryCounters::mPinned"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters11mPinnedDiffE", "tensorrt_llm::runtime::MemoryCounters::mPinnedDiff"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters11mPinnedPoolE", "tensorrt_llm::runtime::MemoryCounters::mPinnedPool"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters15mPinnedPoolDiffE", "tensorrt_llm::runtime::MemoryCounters::mPinnedPoolDiff"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters4mUVME", "tensorrt_llm::runtime::MemoryCounters::mUVM"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8mUVMDiffE", "tensorrt_llm::runtime::MemoryCounters::mUVMDiff"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters8toStringEv", "tensorrt_llm::runtime::MemoryCounters::toString"], [1, 6, 1, "_CPPv4N12tensorrt_llm7runtime10MemoryTypeE", "tensorrt_llm::runtime::MemoryType"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10MemoryType4kCPUE", "tensorrt_llm::runtime::MemoryType::kCPU"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10MemoryType4kGPUE", "tensorrt_llm::runtime::MemoryType::kGPU"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10MemoryType7kPINNEDE", "tensorrt_llm::runtime::MemoryType::kPINNED"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10MemoryType11kPINNEDPOOLE", "tensorrt_llm::runtime::MemoryType::kPINNEDPOOL"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10MemoryType4kUVME", "tensorrt_llm::runtime::MemoryType::kUVM"], [1, 2, 1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime16MemoryTypeStringE", "tensorrt_llm::runtime::MemoryTypeString"], [1, 8, 1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime16MemoryTypeStringE", "tensorrt_llm::runtime::MemoryTypeString::T"], [1, 2, 1, 
"_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kCPUEEE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kCPU&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kCPUEE5valueE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kCPU&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kGPUEEE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kGPU&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kGPUEE5valueE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kGPU&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType7kPINNEDEEE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kPINNED&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType7kPINNEDEE5valueE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kPINNED&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType11kPINNEDPOOLEEE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kPINNEDPOOL&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType11kPINNEDPOOLEE5valueE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kPINNEDPOOL&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kUVMEEE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kUVM&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kUVMEE5valueE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kUVM&gt;::value"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfigE", "tensorrt_llm::runtime::ModelConfig"], [1, 6, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11KVCacheTypeE", "tensorrt_llm::runtime::ModelConfig::KVCacheType"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11KVCacheType11kCONTINUOUSE", "tensorrt_llm::runtime::ModelConfig::KVCacheType::kCONTINUOUS"], [1, 7, 1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig11KVCacheType9kDISABLEDE", "tensorrt_llm::runtime::ModelConfig::KVCacheType::kDISABLED"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11KVCacheType6kPAGEDE", "tensorrt_llm::runtime::ModelConfig::KVCacheType::kPAGED"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21KVCacheTypeFromStringENSt6stringE", "tensorrt_llm::runtime::ModelConfig::KVCacheTypeFromString"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21KVCacheTypeFromStringENSt6stringE", "tensorrt_llm::runtime::ModelConfig::KVCacheTypeFromString::value"], [1, 6, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerTypeE", "tensorrt_llm::runtime::ModelConfig::LayerType"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerType10kATTENTIONE", "tensorrt_llm::runtime::ModelConfig::LayerType::kATTENTION"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerType7kLINEARE", "tensorrt_llm::runtime::ModelConfig::LayerType::kLINEAR"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerType5kNOOPE", "tensorrt_llm::runtime::ModelConfig::LayerType::kNOOP"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerType10kRECURRENTE", "tensorrt_llm::runtime::ModelConfig::LayerType::kRECURRENT"], [1, 6, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17ManageWeightsTypeE", "tensorrt_llm::runtime::ModelConfig::ManageWeightsType"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17ManageWeightsType9kDisabledE", "tensorrt_llm::runtime::ModelConfig::ManageWeightsType::kDisabled"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17ManageWeightsType8kEnabledE", "tensorrt_llm::runtime::ModelConfig::ManageWeightsType::kEnabled"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11ModelConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::ModelConfig"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig11ModelConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::ModelConfig::dtype"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11ModelConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::ModelConfig::hiddenSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11ModelConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::ModelConfig::nbAttentionLayers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11ModelConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::ModelConfig::nbHeads"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11ModelConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::ModelConfig::nbLayers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11ModelConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::ModelConfig::nbRnnLayers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11ModelConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::ModelConfig::vocabSize"], [1, 6, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariantE", "tensorrt_llm::runtime::ModelConfig::ModelVariant"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant8kChatGlmE", "tensorrt_llm::runtime::ModelConfig::ModelVariant::kChatGlm"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant7kEncDecE", "tensorrt_llm::runtime::ModelConfig::ModelVariant::kEncDec"], [1, 7, 1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant4kGlmE", "tensorrt_llm::runtime::ModelConfig::ModelVariant::kGlm"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant4kGptE", "tensorrt_llm::runtime::ModelConfig::ModelVariant::kGpt"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant6kMambaE", "tensorrt_llm::runtime::ModelConfig::ModelVariant::kMamba"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant15kRecurrentGemmaE", "tensorrt_llm::runtime::ModelConfig::ModelVariant::kRecurrentGemma"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfigE", "tensorrt_llm::runtime::ModelConfig::RnnConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig10convKernelE", "tensorrt_llm::runtime::ModelConfig::RnnConfig::convKernel"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig14rnnConvDimSizeE", "tensorrt_llm::runtime::ModelConfig::RnnConfig::rnnConvDimSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig11rnnHeadSizeE", "tensorrt_llm::runtime::ModelConfig::RnnConfig::rnnHeadSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig13rnnHiddenSizeE", "tensorrt_llm::runtime::ModelConfig::RnnConfig::rnnHiddenSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig9stateSizeE", "tensorrt_llm::runtime::ModelConfig::RnnConfig::stateSize"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20computeContextLogitsEb", "tensorrt_llm::runtime::ModelConfig::computeContextLogits"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20computeContextLogitsEv", "tensorrt_llm::runtime::ModelConfig::computeContextLogits"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20computeContextLogitsEb", "tensorrt_llm::runtime::ModelConfig::computeContextLogits::computeContextLogits"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig23computeGenerationLogitsEb", "tensorrt_llm::runtime::ModelConfig::computeGenerationLogits"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime11ModelConfig23computeGenerationLogitsEv", "tensorrt_llm::runtime::ModelConfig::computeGenerationLogits"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig23computeGenerationLogitsEb", "tensorrt_llm::runtime::ModelConfig::computeGenerationLogits::computeGenerationLogits"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16countLocalLayersE9LayerType10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::countLocalLayers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16countLocalLayersE9LayerType10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::countLocalLayers::layerType"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16countLocalLayersE9LayerType10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::countLocalLayers::pipelineParallelism"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16countLocalLayersE9LayerType10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::countLocalLayers::pipelineParallelismRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20countLowerRankLayersE9LayerType10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::countLowerRankLayers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20countLowerRankLayersE9LayerType10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::countLowerRankLayers::layerType"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20countLowerRankLayersE9LayerType10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::countLowerRankLayers::pipelineParallelism"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20countLowerRankLayersE9LayerType10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::countLowerRankLayers::pipelineParallelismRank"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig32disableSeamlessLookaheadDecodingEv", "tensorrt_llm::runtime::ModelConfig::disableSeamlessLookaheadDecoding"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig31enableSeamlessLookaheadDecodingE10SizeType32", "tensorrt_llm::runtime::ModelConfig::enableSeamlessLookaheadDecoding"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig31enableSeamlessLookaheadDecodingE10SizeType32", "tensorrt_llm::runtime::ModelConfig::enableSeamlessLookaheadDecoding::maxDraftTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getContextFMHAEv", "tensorrt_llm::runtime::ModelConfig::getContextFMHA"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getDataTypeEv", "tensorrt_llm::runtime::ModelConfig::getDataType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getEncoderHiddenSizeEv", "tensorrt_llm::runtime::ModelConfig::getEncoderHiddenSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21getGemmAllReduceDtypeEv", "tensorrt_llm::runtime::ModelConfig::getGemmAllReduceDtype"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13getHiddenSizeEv", "tensorrt_llm::runtime::ModelConfig::getHiddenSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getKVCacheTypeEv", "tensorrt_llm::runtime::ModelConfig::getKVCacheType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13getKvDataTypeEv", "tensorrt_llm::runtime::ModelConfig::getKvDataType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13getLayerTypesEv", "tensorrt_llm::runtime::ModelConfig::getLayerTypes"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getLogitsDtypeEv", "tensorrt_llm::runtime::ModelConfig::getLogitsDtype"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getLoraModulesEv", "tensorrt_llm::runtime::ModelConfig::getLoraModules"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getManageWeightsTypeEv", "tensorrt_llm::runtime::ModelConfig::getManageWeightsType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getMaxBatchSizeEv", "tensorrt_llm::runtime::ModelConfig::getMaxBatchSize"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getMaxBeamWidthEv", "tensorrt_llm::runtime::ModelConfig::getMaxBeamWidth"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig25getMaxDecodingDraftTokensEv", "tensorrt_llm::runtime::ModelConfig::getMaxDecodingDraftTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getMaxDecodingTokensEv", "tensorrt_llm::runtime::ModelConfig::getMaxDecodingTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16getMaxEncoderLenEv", "tensorrt_llm::runtime::ModelConfig::getMaxEncoderLen"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getMaxInputLenEv", "tensorrt_llm::runtime::ModelConfig::getMaxInputLen"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getMaxLoraRankEv", "tensorrt_llm::runtime::ModelConfig::getMaxLoraRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getMaxNumTokensEv", "tensorrt_llm::runtime::ModelConfig::getMaxNumTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig24getMaxPositionEmbeddingsEv", "tensorrt_llm::runtime::ModelConfig::getMaxPositionEmbeddings"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig30getMaxPromptEmbeddingTableSizeEv", "tensorrt_llm::runtime::ModelConfig::getMaxPromptEmbeddingTableSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig17getMaxSequenceLenEv", "tensorrt_llm::runtime::ModelConfig::getMaxSequenceLen"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16getMlpHiddenSizeEv", "tensorrt_llm::runtime::ModelConfig::getMlpHiddenSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getModelNameEv", "tensorrt_llm::runtime::ModelConfig::getModelName"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getModelVariantEv", "tensorrt_llm::runtime::ModelConfig::getModelVariant"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getNbAttentionLayersE10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbAttentionLayers"], [1, 4, 1, 
"_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getNbAttentionLayersE10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbAttentionLayers::pipelineParallelism"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getNbAttentionLayersE10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbAttentionLayers::pipelineParallelismRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig10getNbHeadsEv", "tensorrt_llm::runtime::ModelConfig::getNbHeads"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getNbKvHeadsE10SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbKvHeads"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getNbKvHeadsE10SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbKvHeads::layerIdx"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbLayers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbLayers::pipelineParallelism"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getNbRnnLayersE10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbRnnLayers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getNbRnnLayersE10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbRnnLayers::pipelineParallelism"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getNbRnnLayersE10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbRnnLayers::pipelineParallelismRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21getNumKvHeadsPerLayerEv", "tensorrt_llm::runtime::ModelConfig::getNumKvHeadsPerLayer"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig31getNumKvHeadsPerLayerLocalRangeE10SizeType3210SizeType32b", "tensorrt_llm::runtime::ModelConfig::getNumKvHeadsPerLayerLocalRange"], [1, 4, 1, 
"_CPPv4NK12tensorrt_llm7runtime11ModelConfig31getNumKvHeadsPerLayerLocalRangeE10SizeType3210SizeType32b", "tensorrt_llm::runtime::ModelConfig::getNumKvHeadsPerLayerLocalRange::isCrossAttention"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig31getNumKvHeadsPerLayerLocalRangeE10SizeType3210SizeType32b", "tensorrt_llm::runtime::ModelConfig::getNumKvHeadsPerLayerLocalRange::pipelineParallelism"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig31getNumKvHeadsPerLayerLocalRangeE10SizeType3210SizeType32b", "tensorrt_llm::runtime::ModelConfig::getNumKvHeadsPerLayerLocalRange::pipelineParallelismRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getNumLanguagesEv", "tensorrt_llm::runtime::ModelConfig::getNumLanguages"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig25getOptProfilesSplitPointsEv", "tensorrt_llm::runtime::ModelConfig::getOptProfilesSplitPoints"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig19getPagedContextFMHAEv", "tensorrt_llm::runtime::ModelConfig::getPagedContextFMHA"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getPpReduceScatterEv", "tensorrt_llm::runtime::ModelConfig::getPpReduceScatter"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getQuantModeEv", "tensorrt_llm::runtime::ModelConfig::getQuantMode"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getRnnConfigEv", "tensorrt_llm::runtime::ModelConfig::getRnnConfig"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21getRotaryEmbeddingDimEv", "tensorrt_llm::runtime::ModelConfig::getRotaryEmbeddingDim"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getSizePerHeadEv", "tensorrt_llm::runtime::ModelConfig::getSizePerHead"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig26getSpeculativeDecodingModeEv", "tensorrt_llm::runtime::ModelConfig::getSpeculativeDecodingMode"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig28getSpeculativeDecodingModuleEv", 
"tensorrt_llm::runtime::ModelConfig::getSpeculativeDecodingModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig31getSpeculativeDecodingModulePtrEv", "tensorrt_llm::runtime::ModelConfig::getSpeculativeDecodingModulePtr"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig31getSpeculativeDecodingModulePtrEv", "tensorrt_llm::runtime::ModelConfig::getSpeculativeDecodingModulePtr"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getSumLocalKvHeadsE10SizeType3210SizeType32b", "tensorrt_llm::runtime::ModelConfig::getSumLocalKvHeads"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getSumLocalKvHeadsE10SizeType3210SizeType32b", "tensorrt_llm::runtime::ModelConfig::getSumLocalKvHeads::isCrossAttention"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getSumLocalKvHeadsE10SizeType3210SizeType32b", "tensorrt_llm::runtime::ModelConfig::getSumLocalKvHeads::pipelineParallelism"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getSumLocalKvHeadsE10SizeType3210SizeType32b", "tensorrt_llm::runtime::ModelConfig::getSumLocalKvHeads::pipelineParallelismRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig17getTokensPerBlockEv", "tensorrt_llm::runtime::ModelConfig::getTokensPerBlock"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getVocabSizeEv", "tensorrt_llm::runtime::ModelConfig::getVocabSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getVocabSizePaddedE10SizeType32", "tensorrt_llm::runtime::ModelConfig::getVocabSizePadded"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getVocabSizePaddedE10SizeType32", "tensorrt_llm::runtime::ModelConfig::getVocabSizePadded::worldSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12hasRnnConfigEv", "tensorrt_llm::runtime::ModelConfig::hasRnnConfig"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig28hasSpeculativeDecodingModuleEv", "tensorrt_llm::runtime::ModelConfig::hasSpeculativeDecodingModule"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime11ModelConfig19isContinuousKVCacheEv", "tensorrt_llm::runtime::ModelConfig::isContinuousKVCache"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16isKVCacheEnabledEv", "tensorrt_llm::runtime::ModelConfig::isKVCacheEnabled"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12isMultiModalEv", "tensorrt_llm::runtime::ModelConfig::isMultiModal"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14isPagedKVCacheEv", "tensorrt_llm::runtime::ModelConfig::isPagedKVCache"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig10isRnnBasedEv", "tensorrt_llm::runtime::ModelConfig::isRnnBased"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18isTransformerBasedEv", "tensorrt_llm::runtime::ModelConfig::isTransformerBased"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig9isWhisperEv", "tensorrt_llm::runtime::ModelConfig::isWhisper"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig29kDEFAULT_NUM_TOKENS_PER_BLOCKE", "tensorrt_llm::runtime::ModelConfig::kDEFAULT_NUM_TOKENS_PER_BLOCK"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26kOPT_PROFILES_SPLIT_POINTSE", "tensorrt_llm::runtime::ModelConfig::kOPT_PROFILES_SPLIT_POINTS"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21mComputeContextLogitsE", "tensorrt_llm::runtime::ModelConfig::mComputeContextLogits"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24mComputeGenerationLogitsE", "tensorrt_llm::runtime::ModelConfig::mComputeGenerationLogits"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mContextFMHAE", "tensorrt_llm::runtime::ModelConfig::mContextFMHA"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9mDataTypeE", "tensorrt_llm::runtime::ModelConfig::mDataType"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mEncoderHiddenSizeE", "tensorrt_llm::runtime::ModelConfig::mEncoderHiddenSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig19mGemmAllReduceDtypeE", 
"tensorrt_llm::runtime::ModelConfig::mGemmAllReduceDtype"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11mHiddenSizeE", "tensorrt_llm::runtime::ModelConfig::mHiddenSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mInputPackedE", "tensorrt_llm::runtime::ModelConfig::mInputPacked"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mKVCacheTypeE", "tensorrt_llm::runtime::ModelConfig::mKVCacheType"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11mLayerTypesE", "tensorrt_llm::runtime::ModelConfig::mLayerTypes"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mLogitsDtypeE", "tensorrt_llm::runtime::ModelConfig::mLogitsDtype"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mLoraModulesE", "tensorrt_llm::runtime::ModelConfig::mLoraModules"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mManageWeightsTypeE", "tensorrt_llm::runtime::ModelConfig::mManageWeightsType"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mMaxBatchSizeE", "tensorrt_llm::runtime::ModelConfig::mMaxBatchSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mMaxBeamWidthE", "tensorrt_llm::runtime::ModelConfig::mMaxBeamWidth"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14mMaxEncoderLenE", "tensorrt_llm::runtime::ModelConfig::mMaxEncoderLen"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mMaxInputLenE", "tensorrt_llm::runtime::ModelConfig::mMaxInputLen"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mMaxLoraRankE", "tensorrt_llm::runtime::ModelConfig::mMaxLoraRank"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mMaxNumTokensE", "tensorrt_llm::runtime::ModelConfig::mMaxNumTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22mMaxPositionEmbeddingsE", "tensorrt_llm::runtime::ModelConfig::mMaxPositionEmbeddings"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig28mMaxPromptEmbeddingTableSizeE", "tensorrt_llm::runtime::ModelConfig::mMaxPromptEmbeddingTableSize"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig15mMaxSequenceLenE", "tensorrt_llm::runtime::ModelConfig::mMaxSequenceLen"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14mMlpHiddenSizeE", "tensorrt_llm::runtime::ModelConfig::mMlpHiddenSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig10mModelNameE", "tensorrt_llm::runtime::ModelConfig::mModelName"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mModelVariantE", "tensorrt_llm::runtime::ModelConfig::mModelVariant"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mNbAttentionLayersE", "tensorrt_llm::runtime::ModelConfig::mNbAttentionLayers"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig8mNbHeadsE", "tensorrt_llm::runtime::ModelConfig::mNbHeads"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9mNbLayersE", "tensorrt_llm::runtime::ModelConfig::mNbLayers"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mNbRnnLayersE", "tensorrt_llm::runtime::ModelConfig::mNbRnnLayers"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig28mNumKvHeadsPerAttentionLayerE", "tensorrt_llm::runtime::ModelConfig::mNumKvHeadsPerAttentionLayer"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig33mNumKvHeadsPerCrossAttentionLayerE", "tensorrt_llm::runtime::ModelConfig::mNumKvHeadsPerCrossAttentionLayer"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mNumLanguagesE", "tensorrt_llm::runtime::ModelConfig::mNumLanguages"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17mPagedContextFMHAE", "tensorrt_llm::runtime::ModelConfig::mPagedContextFMHA"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11mPagedStateE", "tensorrt_llm::runtime::ModelConfig::mPagedState"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig16mPpReduceScatterE", "tensorrt_llm::runtime::ModelConfig::mPpReduceScatter"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig10mQuantModeE", "tensorrt_llm::runtime::ModelConfig::mQuantMode"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig10mRnnConfigE", "tensorrt_llm::runtime::ModelConfig::mRnnConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig19mRotaryEmbeddingDimE", "tensorrt_llm::runtime::ModelConfig::mRotaryEmbeddingDim"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mSizePerHeadE", "tensorrt_llm::runtime::ModelConfig::mSizePerHead"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20mSkipCrossAttnBlocksE", "tensorrt_llm::runtime::ModelConfig::mSkipCrossAttnBlocks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24mSpeculativeDecodingModeE", "tensorrt_llm::runtime::ModelConfig::mSpeculativeDecodingMode"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26mSpeculativeDecodingModuleE", "tensorrt_llm::runtime::ModelConfig::mSpeculativeDecodingModule"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15mTokensPerBlockE", "tensorrt_llm::runtime::ModelConfig::mTokensPerBlock"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mUseCrossAttentionE", "tensorrt_llm::runtime::ModelConfig::mUseCrossAttention"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig23mUseGemmAllReducePluginE", "tensorrt_llm::runtime::ModelConfig::mUseGemmAllReducePlugin"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22mUseGptAttentionPluginE", "tensorrt_llm::runtime::ModelConfig::mUseGptAttentionPlugin"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14mUseLoraPluginE", "tensorrt_llm::runtime::ModelConfig::mUseLoraPlugin"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21mUseMambaConv1dPluginE", "tensorrt_llm::runtime::ModelConfig::mUseMambaConv1dPlugin"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9mUseMropeE", "tensorrt_llm::runtime::ModelConfig::mUseMrope"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21mUsePositionEmbeddingE", "tensorrt_llm::runtime::ModelConfig::mUsePositionEmbedding"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mUseShapeInferenceE", 
"tensorrt_llm::runtime::ModelConfig::mUseShapeInference"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22mUseTokenTypeEmbeddingE", "tensorrt_llm::runtime::ModelConfig::mUseTokenTypeEmbedding"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig10mVocabSizeE", "tensorrt_llm::runtime::ModelConfig::mVocabSize"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig30resetSpeculativeDecodingModuleEv", "tensorrt_llm::runtime::ModelConfig::resetSpeculativeDecodingModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setContextFMHAEb", "tensorrt_llm::runtime::ModelConfig::setContextFMHA"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setContextFMHAEb", "tensorrt_llm::runtime::ModelConfig::setContextFMHA::contextFMHA"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setEncoderHiddenSizeE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setEncoderHiddenSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setEncoderHiddenSizeE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setEncoderHiddenSize::encoderHiddenSize"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21setGemmAllReduceDtypeEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::setGemmAllReduceDtype"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21setGemmAllReduceDtypeEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::setGemmAllReduceDtype::inputDtype"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setKVCacheTypeE11KVCacheType", "tensorrt_llm::runtime::ModelConfig::setKVCacheType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setKVCacheTypeE11KVCacheType", "tensorrt_llm::runtime::ModelConfig::setKVCacheType::kvCacheType"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13setLayerTypesERKNSt6vectorI9LayerTypeEE", "tensorrt_llm::runtime::ModelConfig::setLayerTypes"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13setLayerTypesERKNSt6vectorI9LayerTypeEE", 
"tensorrt_llm::runtime::ModelConfig::setLayerTypes::layerTypes"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setLogitsDtypeEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::setLogitsDtype"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setLogitsDtypeEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::setLogitsDtype::inputDtype"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setLoraModulesERKNSt6vectorI10LoraModuleEE", "tensorrt_llm::runtime::ModelConfig::setLoraModules"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setLoraModulesERKNSt6vectorI10LoraModuleEE", "tensorrt_llm::runtime::ModelConfig::setLoraModules::loraModules"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setManageWeightsTypeEK17ManageWeightsType", "tensorrt_llm::runtime::ModelConfig::setManageWeightsType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setManageWeightsTypeEK17ManageWeightsType", "tensorrt_llm::runtime::ModelConfig::setManageWeightsType::manageWeightType"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setMaxBatchSizeE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setMaxBatchSizeE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxBatchSize::maxBatchSize"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setMaxBeamWidthE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setMaxBeamWidthE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxBeamWidth::maxBeamWidth"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig16setMaxEncoderLenE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxEncoderLen"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig16setMaxEncoderLenE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxEncoderLen::maxEncoderLen"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig14setMaxInputLenE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxInputLen"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setMaxInputLenE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxInputLen::maxInputLen"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setMaxLoraRankE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxLoraRank"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setMaxLoraRankE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxLoraRank::maxLoraRank"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setMaxNumTokensENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::ModelConfig::setMaxNumTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setMaxNumTokensENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::ModelConfig::setMaxNumTokens::maxNumTokens"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24setMaxPositionEmbeddingsE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxPositionEmbeddings"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24setMaxPositionEmbeddingsE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxPositionEmbeddings::maxPositionEmbeddings"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig30setMaxPromptEmbeddingTableSizeE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxPromptEmbeddingTableSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig30setMaxPromptEmbeddingTableSizeE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxPromptEmbeddingTableSize::maxPromptEmbeddingTableSize"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17setMaxSequenceLenE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxSequenceLen"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17setMaxSequenceLenE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxSequenceLen::maxSequenceLen"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig16setMlpHiddenSizeE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMlpHiddenSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig16setMlpHiddenSizeE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMlpHiddenSize::mlpHiddenSize"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setModelNameERKNSt6stringE", "tensorrt_llm::runtime::ModelConfig::setModelName"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setModelNameERKNSt6stringE", "tensorrt_llm::runtime::ModelConfig::setModelName::modelName"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setModelVariantE12ModelVariant", "tensorrt_llm::runtime::ModelConfig::setModelVariant"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setModelVariantE12ModelVariant", "tensorrt_llm::runtime::ModelConfig::setModelVariant::modelVariant"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17setNbCrossKvHeadsE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setNbCrossKvHeads"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17setNbCrossKvHeadsE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setNbCrossKvHeads::nbKvHeads"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setNbKvHeadsE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setNbKvHeads"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setNbKvHeadsE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setNbKvHeads::nbKvHeads"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26setNumKvHeadsPerCrossLayerERKNSt6vectorI10SizeType32EE", "tensorrt_llm::runtime::ModelConfig::setNumKvHeadsPerCrossLayer"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26setNumKvHeadsPerCrossLayerERKNSt6vectorI10SizeType32EE", "tensorrt_llm::runtime::ModelConfig::setNumKvHeadsPerCrossLayer::headsPerLayer"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21setNumKvHeadsPerLayerERKNSt6vectorI10SizeType32EE", 
"tensorrt_llm::runtime::ModelConfig::setNumKvHeadsPerLayer"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21setNumKvHeadsPerLayerERKNSt6vectorI10SizeType32EE", "tensorrt_llm::runtime::ModelConfig::setNumKvHeadsPerLayer::headsPerLayer"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setNumLanguagesENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::ModelConfig::setNumLanguages"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setNumLanguagesENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::ModelConfig::setNumLanguages::numLanguages"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig19setPagedContextFMHAEb", "tensorrt_llm::runtime::ModelConfig::setPagedContextFMHA"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig19setPagedContextFMHAEb", "tensorrt_llm::runtime::ModelConfig::setPagedContextFMHA::pagedContextFMHA"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18setPpReduceScatterEb", "tensorrt_llm::runtime::ModelConfig::setPpReduceScatter"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18setPpReduceScatterEb", "tensorrt_llm::runtime::ModelConfig::setPpReduceScatter::ppReduceScatter"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setQuantModeEN6common9QuantModeE", "tensorrt_llm::runtime::ModelConfig::setQuantMode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setQuantModeEN6common9QuantModeE", "tensorrt_llm::runtime::ModelConfig::setQuantMode::QuantMode"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setRnnConfigERK9RnnConfig", "tensorrt_llm::runtime::ModelConfig::setRnnConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setRnnConfigERK9RnnConfig", "tensorrt_llm::runtime::ModelConfig::setRnnConfig::rnnConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21setRotaryEmbeddingDimE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setRotaryEmbeddingDim"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21setRotaryEmbeddingDimE10SizeType32", 
"tensorrt_llm::runtime::ModelConfig::setRotaryEmbeddingDim::rotaryEmbeddingDim"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setSizePerHeadE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setSizePerHead"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setSizePerHeadE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setSizePerHead::sizePerHead"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22setSkipCrossAttnBlocksEb", "tensorrt_llm::runtime::ModelConfig::setSkipCrossAttnBlocks"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22setSkipCrossAttnBlocksEb", "tensorrt_llm::runtime::ModelConfig::setSkipCrossAttnBlocks::skipCrossAttnBlocks"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26setSpeculativeDecodingModeE23SpeculativeDecodingMode", "tensorrt_llm::runtime::ModelConfig::setSpeculativeDecodingMode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26setSpeculativeDecodingModeE23SpeculativeDecodingMode", "tensorrt_llm::runtime::ModelConfig::setSpeculativeDecodingMode::mode"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig28setSpeculativeDecodingModuleERKNSt10shared_ptrI25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::ModelConfig::setSpeculativeDecodingModule"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig28setSpeculativeDecodingModuleERKNSt10shared_ptrI25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::ModelConfig::setSpeculativeDecodingModule::speculativeDecodingModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17setTokensPerBlockE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setTokensPerBlock"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17setTokensPerBlockE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setTokensPerBlock::TokensPerBlock"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setUseCrossAttentionEb", "tensorrt_llm::runtime::ModelConfig::setUseCrossAttention"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig20setUseCrossAttentionEb", "tensorrt_llm::runtime::ModelConfig::setUseCrossAttention::useCrossAttention"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11setUseMropeEb", "tensorrt_llm::runtime::ModelConfig::setUseMrope"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11setUseMropeEb", "tensorrt_llm::runtime::ModelConfig::setUseMrope::useMrope"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig23setUsePositionEmbeddingEb", "tensorrt_llm::runtime::ModelConfig::setUsePositionEmbedding"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig23setUsePositionEmbeddingEb", "tensorrt_llm::runtime::ModelConfig::setUsePositionEmbedding::usePositionEmbedding"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setUseShapeInferenceEb", "tensorrt_llm::runtime::ModelConfig::setUseShapeInference"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setUseShapeInferenceEb", "tensorrt_llm::runtime::ModelConfig::setUseShapeInference::useShapeInference"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24setUseTokenTypeEmbeddingEb", "tensorrt_llm::runtime::ModelConfig::setUseTokenTypeEmbedding"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24setUseTokenTypeEmbeddingEb", "tensorrt_llm::runtime::ModelConfig::setUseTokenTypeEmbedding::useTokenTypeEmbedding"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig19skipCrossAttnBlocksEv", "tensorrt_llm::runtime::ModelConfig::skipCrossAttnBlocks"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig24supportsInflightBatchingEv", "tensorrt_llm::runtime::ModelConfig::supportsInflightBatching"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig17useCrossAttentionEv", "tensorrt_llm::runtime::ModelConfig::useCrossAttention"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22useGemmAllReducePluginEb", "tensorrt_llm::runtime::ModelConfig::useGemmAllReducePlugin"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig22useGemmAllReducePluginEv", 
"tensorrt_llm::runtime::ModelConfig::useGemmAllReducePlugin"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22useGemmAllReducePluginEb", "tensorrt_llm::runtime::ModelConfig::useGemmAllReducePlugin::useGemmAllReducePlugin"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21useGptAttentionPluginEb", "tensorrt_llm::runtime::ModelConfig::useGptAttentionPlugin"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21useGptAttentionPluginEv", "tensorrt_llm::runtime::ModelConfig::useGptAttentionPlugin"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21useGptAttentionPluginEb", "tensorrt_llm::runtime::ModelConfig::useGptAttentionPlugin::useGptAttentionPlugin"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18useLanguageAdapterEv", "tensorrt_llm::runtime::ModelConfig::useLanguageAdapter"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13useLoraPluginEb", "tensorrt_llm::runtime::ModelConfig::useLoraPlugin"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13useLoraPluginEv", "tensorrt_llm::runtime::ModelConfig::useLoraPlugin"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13useLoraPluginEb", "tensorrt_llm::runtime::ModelConfig::useLoraPlugin::useLoraPlugin"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20useMambaConv1dPluginEb", "tensorrt_llm::runtime::ModelConfig::useMambaConv1dPlugin"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20useMambaConv1dPluginEv", "tensorrt_llm::runtime::ModelConfig::useMambaConv1dPlugin"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20useMambaConv1dPluginEb", "tensorrt_llm::runtime::ModelConfig::useMambaConv1dPlugin::useMambaConv1dPlugin"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig8useMropeEv", "tensorrt_llm::runtime::ModelConfig::useMrope"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14usePackedInputEb", "tensorrt_llm::runtime::ModelConfig::usePackedInput"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14usePackedInputEv", 
"tensorrt_llm::runtime::ModelConfig::usePackedInput"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14usePackedInputEb", "tensorrt_llm::runtime::ModelConfig::usePackedInput::inputPacked"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13usePagedStateEb", "tensorrt_llm::runtime::ModelConfig::usePagedState"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13usePagedStateEv", "tensorrt_llm::runtime::ModelConfig::usePagedState"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13usePagedStateEb", "tensorrt_llm::runtime::ModelConfig::usePagedState::pagedState"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20usePositionEmbeddingEv", "tensorrt_llm::runtime::ModelConfig::usePositionEmbedding"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15usePromptTuningEv", "tensorrt_llm::runtime::ModelConfig::usePromptTuning"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig17useShapeInferenceEv", "tensorrt_llm::runtime::ModelConfig::useShapeInference"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21useTokenTypeEmbeddingEv", "tensorrt_llm::runtime::ModelConfig::useTokenTypeEmbedding"], [1, 1, 1, "_CPPv4I0EN12tensorrt_llm7runtime18PointerElementTypeE", "tensorrt_llm::runtime::PointerElementType"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime18PointerElementTypeE", "tensorrt_llm::runtime::PointerElementType::T"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParamsE", "tensorrt_llm::runtime::PromptTuningParams"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams18PromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", "tensorrt_llm::runtime::PromptTuningParams::PromptTuningParams"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams18PromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", "tensorrt_llm::runtime::PromptTuningParams::PromptTuningParams::embeddingTable"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams18PromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", 
"tensorrt_llm::runtime::PromptTuningParams::PromptTuningParams::tasks"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams18PromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", "tensorrt_llm::runtime::PromptTuningParams::PromptTuningParams::vocabSize"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams10SizeType32E", "tensorrt_llm::runtime::PromptTuningParams::SizeType32"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams9TensorPtrE", "tensorrt_llm::runtime::PromptTuningParams::TensorPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams15fillTasksTensorE9TensorPtr10SizeType3210SizeType32RKNSt6vectorI10SizeType32EERKNSt6vectorI10SizeType32EERK13BufferManagerb", "tensorrt_llm::runtime::PromptTuningParams::fillTasksTensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams15fillTasksTensorE9TensorPtr10SizeType3210SizeType32RKNSt6vectorI10SizeType32EERKNSt6vectorI10SizeType32EERK13BufferManagerb", "tensorrt_llm::runtime::PromptTuningParams::fillTasksTensor::batchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams15fillTasksTensorE9TensorPtr10SizeType3210SizeType32RKNSt6vectorI10SizeType32EERKNSt6vectorI10SizeType32EERK13BufferManagerb", "tensorrt_llm::runtime::PromptTuningParams::fillTasksTensor::manager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams15fillTasksTensorE9TensorPtr10SizeType3210SizeType32RKNSt6vectorI10SizeType32EERKNSt6vectorI10SizeType32EERK13BufferManagerb", "tensorrt_llm::runtime::PromptTuningParams::fillTasksTensor::numContextRequests"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams15fillTasksTensorE9TensorPtr10SizeType3210SizeType32RKNSt6vectorI10SizeType32EERKNSt6vectorI10SizeType32EERK13BufferManagerb", "tensorrt_llm::runtime::PromptTuningParams::fillTasksTensor::packedInput"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime18PromptTuningParams15fillTasksTensorE9TensorPtr10SizeType3210SizeType32RKNSt6vectorI10SizeType32EERKNSt6vectorI10SizeType32EERK13BufferManagerb", "tensorrt_llm::runtime::PromptTuningParams::fillTasksTensor::reqBeamWidths"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams15fillTasksTensorE9TensorPtr10SizeType3210SizeType32RKNSt6vectorI10SizeType32EERKNSt6vectorI10SizeType32EERK13BufferManagerb", "tensorrt_llm::runtime::PromptTuningParams::fillTasksTensor::reqPromptLengths"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams15fillTasksTensorE9TensorPtr10SizeType3210SizeType32RKNSt6vectorI10SizeType32EERKNSt6vectorI10SizeType32EERK13BufferManagerb", "tensorrt_llm::runtime::PromptTuningParams::fillTasksTensor::tasksHost"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngineE", "tensorrt_llm::runtime::RawEngine"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4Type15AddressWithSizeE", "tensorrt_llm::runtime::RawEngine::AddressWithSize"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4Type8FilePathE", "tensorrt_llm::runtime::RawEngine::FilePath"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4Type10HostMemoryE", "tensorrt_llm::runtime::RawEngine::HostMemory"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineENSt10filesystem4pathE", "tensorrt_llm::runtime::RawEngine::RawEngine"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineEPKN8nvinfer111IHostMemoryE", "tensorrt_llm::runtime::RawEngine::RawEngine"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineEPKvNSt6size_tE", "tensorrt_llm::runtime::RawEngine::RawEngine"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineEPKvNSt6size_tE", "tensorrt_llm::runtime::RawEngine::RawEngine::engineAddr"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineEPKN8nvinfer111IHostMemoryE", "tensorrt_llm::runtime::RawEngine::RawEngine::engineBuffer"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineENSt10filesystem4pathE", "tensorrt_llm::runtime::RawEngine::RawEngine::enginePath"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineEPKvNSt6size_tE", "tensorrt_llm::runtime::RawEngine::RawEngine::engineSize"], [1, 6, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4TypeE", "tensorrt_llm::runtime::RawEngine::Type"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4Type15AddressWithSizeE", "tensorrt_llm::runtime::RawEngine::Type::AddressWithSize"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4Type8FilePathE", "tensorrt_llm::runtime::RawEngine::Type::FilePath"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4Type10HostMemoryE", "tensorrt_llm::runtime::RawEngine::Type::HostMemory"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine10getAddressEv", "tensorrt_llm::runtime::RawEngine::getAddress"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine13getHostMemoryEv", "tensorrt_llm::runtime::RawEngine::getHostMemory"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine23getManagedWeightsMapOptEv", "tensorrt_llm::runtime::RawEngine::getManagedWeightsMapOpt"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine7getPathEv", "tensorrt_llm::runtime::RawEngine::getPath"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine10getPathOptEv", "tensorrt_llm::runtime::RawEngine::getPathOpt"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine7getSizeEv", "tensorrt_llm::runtime::RawEngine::getSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine7getTypeEv", "tensorrt_llm::runtime::RawEngine::getType"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine11mEngineAddrE", "tensorrt_llm::runtime::RawEngine::mEngineAddr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine13mEngineBufferE", "tensorrt_llm::runtime::RawEngine::mEngineBuffer"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine11mEnginePathE", "tensorrt_llm::runtime::RawEngine::mEnginePath"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime9RawEngine11mEngineSizeE", "tensorrt_llm::runtime::RawEngine::mEngineSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine18mManagedWeightsMapE", "tensorrt_llm::runtime::RawEngine::mManagedWeightsMap"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine5mTypeE", "tensorrt_llm::runtime::RawEngine::mType"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine20setManagedWeightsMapENSt3mapINSt6stringEN12tensorrt_llm8executor6TensorEEE", "tensorrt_llm::runtime::RawEngine::setManagedWeightsMap"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine20setManagedWeightsMapENSt3mapINSt6stringEN12tensorrt_llm8executor6TensorEEE", "tensorrt_llm::runtime::RawEngine::setManagedWeightsMap::managedWeightsMap"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine7setPathENSt10filesystem4pathE", "tensorrt_llm::runtime::RawEngine::setPath"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine7setPathENSt10filesystem4pathE", "tensorrt_llm::runtime::RawEngine::setPath::enginePath"], [1, 6, 1, "_CPPv4N12tensorrt_llm7runtime11RequestTypeE", "tensorrt_llm::runtime::RequestType"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11RequestType8kCONTEXTE", "tensorrt_llm::runtime::RequestType::kCONTEXT"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11RequestType11kGENERATIONE", "tensorrt_llm::runtime::RequestType::kGENERATION"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaultsE", "tensorrt_llm::runtime::RuntimeDefaults"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults15RuntimeDefaultsENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::RuntimeDefaults::RuntimeDefaults"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults15RuntimeDefaultsEv", "tensorrt_llm::runtime::RuntimeDefaults::RuntimeDefaults"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults15RuntimeDefaultsENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalI10SizeType32EE", 
"tensorrt_llm::runtime::RuntimeDefaults::RuntimeDefaults::maxAttentionWindowVec"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults15RuntimeDefaultsENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::RuntimeDefaults::RuntimeDefaults::sinkTokenLength"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults21maxAttentionWindowVecE", "tensorrt_llm::runtime::RuntimeDefaults::maxAttentionWindowVec"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults15sinkTokenLengthE", "tensorrt_llm::runtime::RuntimeDefaults::sinkTokenLength"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfigE", "tensorrt_llm::runtime::SamplingConfig"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig9FloatTypeE", "tensorrt_llm::runtime::SamplingConfig::FloatType"], [1, 1, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig6OptVecE", "tensorrt_llm::runtime::SamplingConfig::OptVec"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig6OptVecE", "tensorrt_llm::runtime::SamplingConfig::OptVec::T"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigE10SizeType32", "tensorrt_llm::runtime::SamplingConfig::SamplingConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigERKN8executor14SamplingConfigERKNSt8optionalIN8executor25ExternalDraftTokensConfigEEE", "tensorrt_llm::runtime::SamplingConfig::SamplingConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigERKNSt6vectorI14SamplingConfigEE", "tensorrt_llm::runtime::SamplingConfig::SamplingConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigE10SizeType32", "tensorrt_llm::runtime::SamplingConfig::SamplingConfig::beamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigERKNSt6vectorI14SamplingConfigEE", "tensorrt_llm::runtime::SamplingConfig::SamplingConfig::configs"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigERKN8executor14SamplingConfigERKNSt8optionalIN8executor25ExternalDraftTokensConfigEEE", "tensorrt_llm::runtime::SamplingConfig::SamplingConfig::externalDraftTokensConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigERKN8executor14SamplingConfigERKNSt8optionalIN8executor25ExternalDraftTokensConfigEEE", "tensorrt_llm::runtime::SamplingConfig::SamplingConfig::samplingConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig23beamSearchDiversityRateE", "tensorrt_llm::runtime::SamplingConfig::beamSearchDiversityRate"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig9beamWidthE", "tensorrt_llm::runtime::SamplingConfig::beamWidth"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14beamWidthArrayE", "tensorrt_llm::runtime::SamplingConfig::beamWidthArray"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig11cumLogProbsE", "tensorrt_llm::runtime::SamplingConfig::cumLogProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig24draftAcceptanceThresholdE", "tensorrt_llm::runtime::SamplingConfig::draftAcceptanceThreshold"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig13earlyStoppingE", "tensorrt_llm::runtime::SamplingConfig::earlyStopping"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig16frequencyPenaltyE", "tensorrt_llm::runtime::SamplingConfig::frequencyPenalty"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig10fuseValuesE6OptVecI1TERKNSt6vectorI14SamplingConfigEENSt8functionIF6OptVecI1TE6size_tEEE1T", "tensorrt_llm::runtime::SamplingConfig::fuseValues"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig10fuseValuesE6OptVecI1TERKNSt6vectorI14SamplingConfigEENSt8functionIF6OptVecI1TE6size_tEEE1T", "tensorrt_llm::runtime::SamplingConfig::fuseValues::T"], [1, 4, 1, 
"_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig10fuseValuesE6OptVecI1TERKNSt6vectorI14SamplingConfigEENSt8functionIF6OptVecI1TE6size_tEEE1T", "tensorrt_llm::runtime::SamplingConfig::fuseValues::accessor"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig10fuseValuesE6OptVecI1TERKNSt6vectorI14SamplingConfigEENSt8functionIF6OptVecI1TE6size_tEEE1T", "tensorrt_llm::runtime::SamplingConfig::fuseValues::configs"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig10fuseValuesE6OptVecI1TERKNSt6vectorI14SamplingConfigEENSt8functionIF6OptVecI1TE6size_tEEE1T", "tensorrt_llm::runtime::SamplingConfig::fuseValues::defaultValue"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14SamplingConfig15getMaxBeamWidthEv", "tensorrt_llm::runtime::SamplingConfig::getMaxBeamWidth"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14SamplingConfig17getNumReturnBeamsEv", "tensorrt_llm::runtime::SamplingConfig::getNumReturnBeams"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig13lengthPenaltyE", "tensorrt_llm::runtime::SamplingConfig::lengthPenalty"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig9minLengthE", "tensorrt_llm::runtime::SamplingConfig::minLength"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig4minPE", "tensorrt_llm::runtime::SamplingConfig::minP"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig17noRepeatNgramSizeE", "tensorrt_llm::runtime::SamplingConfig::noRepeatNgramSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig17normalizeLogProbsE", "tensorrt_llm::runtime::SamplingConfig::normalizeLogProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig18numReturnSequencesE", "tensorrt_llm::runtime::SamplingConfig::numReturnSequences"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14SamplingConfigeqERK14SamplingConfig", "tensorrt_llm::runtime::SamplingConfig::operator=="], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime14SamplingConfigeqERK14SamplingConfig", "tensorrt_llm::runtime::SamplingConfig::operator==::other"], [1, 
5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig19originalTemperatureE", "tensorrt_llm::runtime::SamplingConfig::originalTemperature"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14outputLogProbsE", "tensorrt_llm::runtime::SamplingConfig::outputLogProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig15presencePenaltyE", "tensorrt_llm::runtime::SamplingConfig::presencePenalty"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig10randomSeedE", "tensorrt_llm::runtime::SamplingConfig::randomSeed"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig17repetitionPenaltyE", "tensorrt_llm::runtime::SamplingConfig::repetitionPenalty"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig11temperatureE", "tensorrt_llm::runtime::SamplingConfig::temperature"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig4topKE", "tensorrt_llm::runtime::SamplingConfig::topK"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig15topKMedusaHeadsE", "tensorrt_llm::runtime::SamplingConfig::topKMedusaHeads"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig4topPE", "tensorrt_llm::runtime::SamplingConfig::topP"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig9topPDecayE", "tensorrt_llm::runtime::SamplingConfig::topPDecay"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig7topPMinE", "tensorrt_llm::runtime::SamplingConfig::topPMin"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig12topPResetIdsE", "tensorrt_llm::runtime::SamplingConfig::topPResetIds"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig16useDefaultValuesEbRK6OptVecI1TE1T", "tensorrt_llm::runtime::SamplingConfig::useDefaultValues"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig16useDefaultValuesEbRK6OptVecI1TE1T", "tensorrt_llm::runtime::SamplingConfig::useDefaultValues::T"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig16useDefaultValuesEbRK6OptVecI1TE1T", 
"tensorrt_llm::runtime::SamplingConfig::useDefaultValues::defaultValue"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig16useDefaultValuesEbRK6OptVecI1TE1T", "tensorrt_llm::runtime::SamplingConfig::useDefaultValues::vec"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig8validateEv", "tensorrt_llm::runtime::SamplingConfig::validate"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig11validateVecEbNSt6stringERK6OptVecI1TE1TNSt8optionalI1TEE", "tensorrt_llm::runtime::SamplingConfig::validateVec"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig11validateVecEbNSt6stringERK6OptVecI1TE1TNSt8optionalI1TEE", "tensorrt_llm::runtime::SamplingConfig::validateVec::T"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig11validateVecEbNSt6stringERK6OptVecI1TE1TNSt8optionalI1TEE", "tensorrt_llm::runtime::SamplingConfig::validateVec::max"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig11validateVecEbNSt6stringERK6OptVecI1TE1TNSt8optionalI1TEE", "tensorrt_llm::runtime::SamplingConfig::validateVec::min"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig11validateVecEbNSt6stringERK6OptVecI1TE1TNSt8optionalI1TEE", "tensorrt_llm::runtime::SamplingConfig::validateVec::name"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig11validateVecEbNSt6stringERK6OptVecI1TE1TNSt8optionalI1TEE", "tensorrt_llm::runtime::SamplingConfig::validateVec::vec"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime10SizeType32E", "tensorrt_llm::runtime::SizeType32"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime10SizeType64E", "tensorrt_llm::runtime::SizeType64"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingModeE", "tensorrt_llm::runtime::SpeculativeDecodingMode"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode19DraftTokensExternalEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::DraftTokensExternal"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode5EagleEv", 
"tensorrt_llm::runtime::SpeculativeDecodingMode::Eagle"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode19ExplicitDraftTokensEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::ExplicitDraftTokens"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode17LookaheadDecodingEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::LookaheadDecoding"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode6MedusaEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::Medusa"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode4NoneEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::None"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode23SpeculativeDecodingModeE14UnderlyingType", "tensorrt_llm::runtime::SpeculativeDecodingMode::SpeculativeDecodingMode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode23SpeculativeDecodingModeE14UnderlyingType", "tensorrt_llm::runtime::SpeculativeDecodingMode::SpeculativeDecodingMode::state"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode14UnderlyingTypeE", "tensorrt_llm::runtime::SpeculativeDecodingMode::UnderlyingType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode9allBitSetE14UnderlyingType", "tensorrt_llm::runtime::SpeculativeDecodingMode::allBitSet"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode9allBitSetE14UnderlyingType", "tensorrt_llm::runtime::SpeculativeDecodingMode::allBitSet::bits"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode9anyBitSetE14UnderlyingType", "tensorrt_llm::runtime::SpeculativeDecodingMode::anyBitSet"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode9anyBitSetE14UnderlyingType", "tensorrt_llm::runtime::SpeculativeDecodingMode::anyBitSet::bits"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode14hasDraftLogitsEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::hasDraftLogits"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode21isDraftTokensExternalEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::isDraftTokensExternal"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode7isEagleEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::isEagle"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode21isExplicitDraftTokensEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::isExplicitDraftTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode19isLookaheadDecodingEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::isLookaheadDecoding"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode8isMedusaEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::isMedusa"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode6isNoneEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::isNone"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode20kDraftTokensExternalE", "tensorrt_llm::runtime::SpeculativeDecodingMode::kDraftTokensExternal"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode6kEagleE", "tensorrt_llm::runtime::SpeculativeDecodingMode::kEagle"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode20kExplicitDraftTokensE", "tensorrt_llm::runtime::SpeculativeDecodingMode::kExplicitDraftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode18kLookaheadDecodingE", "tensorrt_llm::runtime::SpeculativeDecodingMode::kLookaheadDecoding"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode7kMedusaE", "tensorrt_llm::runtime::SpeculativeDecodingMode::kMedusa"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode5kNoneE", "tensorrt_llm::runtime::SpeculativeDecodingMode::kNone"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode6mStateE", "tensorrt_llm::runtime::SpeculativeDecodingMode::mState"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode20needsDecoderPrologueEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::needsDecoderPrologue"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode18needsKVCacheRewindEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::needsKVCacheRewind"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingModeeqERK23SpeculativeDecodingMode", "tensorrt_llm::runtime::SpeculativeDecodingMode::operator=="], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingModeeqERK23SpeculativeDecodingMode", "tensorrt_llm::runtime::SpeculativeDecodingMode::operator==::other"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode19predictsDraftTokensEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::predictsDraftTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode21requiresAttentionMaskEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::requiresAttentionMask"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode18updatesPositionIdsEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::updatesPositionIds"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode19variableDraftLengthEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::variableDraftLength"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModuleE", "tensorrt_llm::runtime::SpeculativeDecodingModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::SpeculativeDecodingModule::SpeculativeDecodingModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleERK25SpeculativeDecodingModule", "tensorrt_llm::runtime::SpeculativeDecodingModule::SpeculativeDecodingModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleEv", 
"tensorrt_llm::runtime::SpeculativeDecodingModule::SpeculativeDecodingModule"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::SpeculativeDecodingModule::SpeculativeDecodingModule::maxDecodingDraftTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::SpeculativeDecodingModule::SpeculativeDecodingModule::maxDraftPathLen"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::SpeculativeDecodingModule::SpeculativeDecodingModule::maxNumPaths"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleERK25SpeculativeDecodingModule", "tensorrt_llm::runtime::SpeculativeDecodingModule::SpeculativeDecodingModule::o"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule21computeNumPackedMasksEv", "tensorrt_llm::runtime::SpeculativeDecodingModule::computeNumPackedMasks"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule25getMaxDecodingDraftTokensEv", "tensorrt_llm::runtime::SpeculativeDecodingModule::getMaxDecodingDraftTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule20getMaxDecodingTokensEv", "tensorrt_llm::runtime::SpeculativeDecodingModule::getMaxDecodingTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule18getMaxDraftPathLenEv", "tensorrt_llm::runtime::SpeculativeDecodingModule::getMaxDraftPathLen"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule14getMaxNumPathsEv", "tensorrt_llm::runtime::SpeculativeDecodingModule::getMaxNumPaths"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule13getMaxPathLenEv", "tensorrt_llm::runtime::SpeculativeDecodingModule::getMaxPathLen"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule17getNumPackedMasksEv", "tensorrt_llm::runtime::SpeculativeDecodingModule::getNumPackedMasks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule23mMaxDecodingDraftTokensE", "tensorrt_llm::runtime::SpeculativeDecodingModule::mMaxDecodingDraftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule16mMaxDraftPathLenE", "tensorrt_llm::runtime::SpeculativeDecodingModule::mMaxDraftPathLen"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule18mMaxNumPackedMasksE", "tensorrt_llm::runtime::SpeculativeDecodingModule::mMaxNumPackedMasks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule12mMaxNumPathsE", "tensorrt_llm::runtime::SpeculativeDecodingModule::mMaxNumPaths"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModuleaSERK25SpeculativeDecodingModule", "tensorrt_llm::runtime::SpeculativeDecodingModule::operator="], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModuleaSERK25SpeculativeDecodingModule", "tensorrt_llm::runtime::SpeculativeDecodingModule::operator=::o"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule18setMaxDraftPathLenE10SizeType32", "tensorrt_llm::runtime::SpeculativeDecodingModule::setMaxDraftPathLen"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule18setMaxDraftPathLenE10SizeType32", "tensorrt_llm::runtime::SpeculativeDecodingModule::setMaxDraftPathLen::maxDraftPathLen"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule17setMaxDraftTokensE10SizeType32", "tensorrt_llm::runtime::SpeculativeDecodingModule::setMaxDraftTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule17setMaxDraftTokensE10SizeType32", "tensorrt_llm::runtime::SpeculativeDecodingModule::setMaxDraftTokens::maxDraftTokens"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule14setMaxNumPathsE10SizeType32", 
"tensorrt_llm::runtime::SpeculativeDecodingModule::setMaxNumPaths"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule14setMaxNumPathsE10SizeType32", "tensorrt_llm::runtime::SpeculativeDecodingModule::setMaxNumPaths::maxNumPaths"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModuleD0Ev", "tensorrt_llm::runtime::SpeculativeDecodingModule::~SpeculativeDecodingModule"], [1, 1, 1, "_CPPv4I0EN12tensorrt_llm7runtime12StringPtrMapE", "tensorrt_llm::runtime::StringPtrMap"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime12StringPtrMapE", "tensorrt_llm::runtime::StringPtrMap::T"], [1, 2, 1, "_CPPv4I0_bEN12tensorrt_llm7runtime11TRTDataTypeE", "tensorrt_llm::runtime::TRTDataType"], [1, 8, 1, "_CPPv4I0_bEN12tensorrt_llm7runtime11TRTDataTypeE", "tensorrt_llm::runtime::TRTDataType::T"], [1, 2, 1, "_CPPv4I0EN12tensorrt_llm7runtime11TRTDataTypeIP1TEE", "tensorrt_llm::runtime::TRTDataType&lt;T*&gt;"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime11TRTDataTypeIP1TEE", "tensorrt_llm::runtime::TRTDataType&lt;T*&gt;::T"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIP1TE15kUnderlyingTypeE", "tensorrt_llm::runtime::TRTDataType&lt;T*&gt;::kUnderlyingType"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIP1TE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;T*&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIbEE", "tensorrt_llm::runtime::TRTDataType&lt;bool&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIbE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;bool&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIfEE", "tensorrt_llm::runtime::TRTDataType&lt;float&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIfE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;float&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeI4halfEE", "tensorrt_llm::runtime::TRTDataType&lt;half&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeI4halfE5valueE", 
"tensorrt_llm::runtime::TRTDataType&lt;half&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIN7kernels13FinishedStateEEE", "tensorrt_llm::runtime::TRTDataType&lt;kernels::FinishedState&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIN7kernels13FinishedStateEE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;kernels::FinishedState&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIN7kernels12KVCacheIndexEEE", "tensorrt_llm::runtime::TRTDataType&lt;kernels::KVCacheIndex&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIN7kernels12KVCacheIndexEE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;kernels::KVCacheIndex&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIN7runtime11RequestTypeEEE", "tensorrt_llm::runtime::TRTDataType&lt;runtime::RequestType&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIN7runtime11RequestTypeEE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;runtime::RequestType&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt7int32_tEEE", "tensorrt_llm::runtime::TRTDataType&lt;std::int32_t&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt7int32_tEE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;std::int32_t&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt7int64_tEEE", "tensorrt_llm::runtime::TRTDataType&lt;std::int64_t&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt7int64_tEE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;std::int64_t&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt6int8_tEEE", "tensorrt_llm::runtime::TRTDataType&lt;std::int8_t&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt6int8_tEE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;std::int8_t&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt8uint32_tEEE", "tensorrt_llm::runtime::TRTDataType&lt;std::uint32_t&gt;"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt8uint32_tEE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;std::uint32_t&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt8uint64_tEEE", "tensorrt_llm::runtime::TRTDataType&lt;std::uint64_t&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt8uint64_tEE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;std::uint64_t&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt7uint8_tEEE", "tensorrt_llm::runtime::TRTDataType&lt;std::uint8_t&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt7uint8_tEE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;std::uint8_t&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIPvEE", "tensorrt_llm::runtime::TRTDataType&lt;void*&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIPvE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;void*&gt;::value"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime10TllmLoggerE", "tensorrt_llm::runtime::TllmLogger"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10TllmLogger8getLevelEv", "tensorrt_llm::runtime::TllmLogger::getLevel"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10TllmLogger3logE8SeverityPKN8nvinfer19AsciiCharE", "tensorrt_llm::runtime::TllmLogger::log"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10TllmLogger3logE8SeverityPKN8nvinfer19AsciiCharE", "tensorrt_llm::runtime::TllmLogger::log::msg"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10TllmLogger3logE8SeverityPKN8nvinfer19AsciiCharE", "tensorrt_llm::runtime::TllmLogger::log::severity"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10TllmLogger8setLevelE8Severity", "tensorrt_llm::runtime::TllmLogger::setLevel"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10TllmLogger8setLevelE8Severity", "tensorrt_llm::runtime::TllmLogger::setLevel::level"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime16TokenExtraIdTypeE", "tensorrt_llm::runtime::TokenExtraIdType"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime11TokenIdTypeE", 
"tensorrt_llm::runtime::TokenIdType"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime11UniqueTokenE", "tensorrt_llm::runtime::UniqueToken"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11UniqueTokeneqERK11UniqueToken", "tensorrt_llm::runtime::UniqueToken::operator=="], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11UniqueTokeneqERK11UniqueToken", "tensorrt_llm::runtime::UniqueToken::operator==::other"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11UniqueToken12tokenExtraIdE", "tensorrt_llm::runtime::UniqueToken::tokenExtraId"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11UniqueToken7tokenIdE", "tensorrt_llm::runtime::UniqueToken::tokenId"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime16VecTokenExtraIdsE", "tensorrt_llm::runtime::VecTokenExtraIds"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime15VecUniqueTokensE", "tensorrt_llm::runtime::VecUniqueTokens"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfigE", "tensorrt_llm::runtime::WorldConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig11WorldConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::WorldConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig11WorldConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::WorldConfig::contextParallelism"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig11WorldConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::WorldConfig::deviceIds"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig11WorldConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::WorldConfig::enableAttentionDP"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime11WorldConfig11WorldConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::WorldConfig::gpusPerNode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig11WorldConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::WorldConfig::pipelineParallelism"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig11WorldConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::WorldConfig::rank"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig11WorldConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::WorldConfig::tensorParallelism"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig17enableAttentionDPEv", "tensorrt_llm::runtime::WorldConfig::enableAttentionDP"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig23getContextParallelGroupEv", "tensorrt_llm::runtime::WorldConfig::getContextParallelGroup"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig22getContextParallelRankEv", "tensorrt_llm::runtime::WorldConfig::getContextParallelRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig21getContextParallelismEv", "tensorrt_llm::runtime::WorldConfig::getContextParallelism"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig9getDeviceEv", "tensorrt_llm::runtime::WorldConfig::getDevice"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig11getDeviceOfE10SizeType32", "tensorrt_llm::runtime::WorldConfig::getDeviceOf"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig11getDeviceOfE10SizeType32", "tensorrt_llm::runtime::WorldConfig::getDeviceOf::rank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig15getGpusPerGroupEv", 
"tensorrt_llm::runtime::WorldConfig::getGpusPerGroup"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig14getGpusPerNodeEv", "tensorrt_llm::runtime::WorldConfig::getGpusPerNode"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig11getLastRankEv", "tensorrt_llm::runtime::WorldConfig::getLastRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig12getLocalRankEv", "tensorrt_llm::runtime::WorldConfig::getLocalRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig11getNodeRankEv", "tensorrt_llm::runtime::WorldConfig::getNodeRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig13getNodeRankOfE10SizeType32", "tensorrt_llm::runtime::WorldConfig::getNodeRankOf"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig13getNodeRankOfE10SizeType32", "tensorrt_llm::runtime::WorldConfig::getNodeRankOf::rank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig24getPipelineParallelGroupEv", "tensorrt_llm::runtime::WorldConfig::getPipelineParallelGroup"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig23getPipelineParallelRankEv", "tensorrt_llm::runtime::WorldConfig::getPipelineParallelRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig22getPipelineParallelismEv", "tensorrt_llm::runtime::WorldConfig::getPipelineParallelism"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig7getRankEv", "tensorrt_llm::runtime::WorldConfig::getRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig7getSizeEv", "tensorrt_llm::runtime::WorldConfig::getSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig22getTensorParallelGroupEv", "tensorrt_llm::runtime::WorldConfig::getTensorParallelGroup"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig21getTensorParallelRankEv", "tensorrt_llm::runtime::WorldConfig::getTensorParallelRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig20getTensorParallelismEv", "tensorrt_llm::runtime::WorldConfig::getTensorParallelism"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime11WorldConfig17isContextParallelEv", "tensorrt_llm::runtime::WorldConfig::isContextParallel"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig26isFirstContextParallelRankEv", "tensorrt_llm::runtime::WorldConfig::isFirstContextParallelRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig27isFirstPipelineParallelRankEv", "tensorrt_llm::runtime::WorldConfig::isFirstPipelineParallelRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig25isFirstTensorParallelRankEv", "tensorrt_llm::runtime::WorldConfig::isFirstTensorParallelRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig26isLastPipelineParallelRankEv", "tensorrt_llm::runtime::WorldConfig::isLastPipelineParallelRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig18isPipelineParallelEv", "tensorrt_llm::runtime::WorldConfig::isPipelineParallel"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig16isTensorParallelEv", "tensorrt_llm::runtime::WorldConfig::isTensorParallel"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig19kDefaultGpusPerNodeE", "tensorrt_llm::runtime::WorldConfig::kDefaultGpusPerNode"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig19mContextParallelismE", "tensorrt_llm::runtime::WorldConfig::mContextParallelism"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig10mDeviceIdsE", "tensorrt_llm::runtime::WorldConfig::mDeviceIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig18mEnableAttentionDPE", "tensorrt_llm::runtime::WorldConfig::mEnableAttentionDP"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig12mGpusPerNodeE", "tensorrt_llm::runtime::WorldConfig::mGpusPerNode"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig20mPipelineParallelismE", "tensorrt_llm::runtime::WorldConfig::mPipelineParallelism"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig5mRankE", "tensorrt_llm::runtime::WorldConfig::mRank"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig18mTensorParallelismE", 
"tensorrt_llm::runtime::WorldConfig::mTensorParallelism"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig3mpiE10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::mpi"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig3mpiE10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::mpi::contextParallelism"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig3mpiE10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::mpi::deviceIds"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig3mpiE10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::mpi::enableAttentionDP"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig3mpiE10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::mpi::gpusPerNode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig3mpiE10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::mpi::pipelineParallelism"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig3mpiE10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::mpi::tensorParallelism"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig14validMpiConfigEv", "tensorrt_llm::runtime::WorldConfig::validMpiConfig"], [1, 3, 1, 
"_CPPv4I0EN12tensorrt_llm7runtime10bufferCastEP1TR7IBuffer", "tensorrt_llm::runtime::bufferCast"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime10bufferCastEPK1TRK7IBuffer", "tensorrt_llm::runtime::bufferCast"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime10bufferCastEP1TR7IBuffer", "tensorrt_llm::runtime::bufferCast::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime10bufferCastEPK1TRK7IBuffer", "tensorrt_llm::runtime::bufferCast::T"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime10bufferCastEP1TR7IBuffer", "tensorrt_llm::runtime::bufferCast::buffer"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime10bufferCastEPK1TRK7IBuffer", "tensorrt_llm::runtime::bufferCast::buffer"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKN7IBuffer9SharedPtrE", "tensorrt_llm::runtime::bufferCastOrNull"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKN7ITensor9SharedPtrE", "tensorrt_llm::runtime::bufferCastOrNull"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKNSt8optionalIN7IBuffer9SharedPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKNSt8optionalIN7ITensor9SharedPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKN7IBuffer14SharedConstPtrE", "tensorrt_llm::runtime::bufferCastOrNull"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKN7ITensor14SharedConstPtrE", "tensorrt_llm::runtime::bufferCastOrNull"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKNSt8optionalIN7IBuffer14SharedConstPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKNSt8optionalIN7ITensor14SharedConstPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKN7IBuffer9SharedPtrE", "tensorrt_llm::runtime::bufferCastOrNull::T"], [1, 8, 1, 
"_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKN7ITensor9SharedPtrE", "tensorrt_llm::runtime::bufferCastOrNull::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKNSt8optionalIN7IBuffer9SharedPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKNSt8optionalIN7ITensor9SharedPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKN7IBuffer14SharedConstPtrE", "tensorrt_llm::runtime::bufferCastOrNull::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKN7ITensor14SharedConstPtrE", "tensorrt_llm::runtime::bufferCastOrNull::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKNSt8optionalIN7IBuffer14SharedConstPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKNSt8optionalIN7ITensor14SharedConstPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull::T"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKN7IBuffer9SharedPtrE", "tensorrt_llm::runtime::bufferCastOrNull::bufferPtr"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKN7IBuffer14SharedConstPtrE", "tensorrt_llm::runtime::bufferCastOrNull::bufferPtr"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKNSt8optionalIN7IBuffer9SharedPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull::optionalBufferPtr"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKNSt8optionalIN7IBuffer14SharedConstPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull::optionalBufferPtr"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKNSt8optionalIN7ITensor9SharedPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull::optionalTensorPtr"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKNSt8optionalIN7ITensor14SharedConstPtrEEE", 
"tensorrt_llm::runtime::bufferCastOrNull::optionalTensorPtr"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKN7ITensor9SharedPtrE", "tensorrt_llm::runtime::bufferCastOrNull::tensorPtr"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKN7ITensor14SharedConstPtrE", "tensorrt_llm::runtime::bufferCastOrNull::tensorPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13canAccessPeerERK11WorldConfig", "tensorrt_llm::runtime::canAccessPeer"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13canAccessPeerERK11WorldConfig", "tensorrt_llm::runtime::canAccessPeer::worldConfig"], [1, 3, 1, "_CPPv4I00EN12tensorrt_llm7runtime16constPointerCastENSt10shared_ptrINSt14remove_const_tI1TEEEERRNSt10unique_ptrI1T1DEE", "tensorrt_llm::runtime::constPointerCast"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime16constPointerCastENSt10shared_ptrINSt14remove_const_tI1TEEEERKNSt10shared_ptrI1TEE", "tensorrt_llm::runtime::constPointerCast"], [1, 8, 1, "_CPPv4I00EN12tensorrt_llm7runtime16constPointerCastENSt10shared_ptrINSt14remove_const_tI1TEEEERRNSt10unique_ptrI1T1DEE", "tensorrt_llm::runtime::constPointerCast::D"], [1, 8, 1, "_CPPv4I00EN12tensorrt_llm7runtime16constPointerCastENSt10shared_ptrINSt14remove_const_tI1TEEEERRNSt10unique_ptrI1T1DEE", "tensorrt_llm::runtime::constPointerCast::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime16constPointerCastENSt10shared_ptrINSt14remove_const_tI1TEEEERKNSt10shared_ptrI1TEE", "tensorrt_llm::runtime::constPointerCast::T"], [1, 4, 1, "_CPPv4I00EN12tensorrt_llm7runtime16constPointerCastENSt10shared_ptrINSt14remove_const_tI1TEEEERRNSt10unique_ptrI1T1DEE", "tensorrt_llm::runtime::constPointerCast::ptr"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime16constPointerCastENSt10shared_ptrINSt14remove_const_tI1TEEEERKNSt10shared_ptrI1TEE", "tensorrt_llm::runtime::constPointerCast::ptr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7decoderE", "tensorrt_llm::runtime::decoder"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7decoderE", 
"tensorrt_llm::runtime::decoder"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffersE", "tensorrt_llm::runtime::decoder::BeamSearchBuffers"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers17BeamSearchBuffersERK13BufferManager", "tensorrt_llm::runtime::decoder::BeamSearchBuffers::BeamSearchBuffers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers17BeamSearchBuffersERK13BufferManager", "tensorrt_llm::runtime::decoder::BeamSearchBuffers::BeamSearchBuffers::bufferManager"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers15mCumLogProbsTmpE", "tensorrt_llm::runtime::decoder::BeamSearchBuffers::mCumLogProbsTmp"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers7mNumSMsE", "tensorrt_llm::runtime::decoder::BeamSearchBuffers::mNumSMs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers21mOutputBeamHypothesesE", "tensorrt_llm::runtime::decoder::BeamSearchBuffers::mOutputBeamHypotheses"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers7reshapeE10SizeType3210SizeType32", "tensorrt_llm::runtime::decoder::BeamSearchBuffers::reshape"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers7reshapeE10SizeType3210SizeType32", "tensorrt_llm::runtime::decoder::BeamSearchBuffers::reshape::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers7reshapeE10SizeType3210SizeType32", "tensorrt_llm::runtime::decoder::BeamSearchBuffers::reshape::maxSequenceLength"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderStateE", "tensorrt_llm::runtime::decoder::DecoderState"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState12DecoderStateEN8nvinfer18DataTypeERK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::DecoderState"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState12DecoderStateEN8nvinfer18DataTypeERK13BufferManager", 
"tensorrt_llm::runtime::decoder::DecoderState::DecoderState::bufferManager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState12DecoderStateEN8nvinfer18DataTypeERK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::DecoderState::dtype"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState16DecodingInputPtrE", "tensorrt_llm::runtime::decoder::DecoderState::DecodingInputPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState17DecodingOutputPtrE", "tensorrt_llm::runtime::decoder::DecoderState::DecodingOutputPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState13LlmRequestPtrE", "tensorrt_llm::runtime::decoder::DecoderState::LlmRequestPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState13RequestVectorE", "tensorrt_llm::runtime::decoder::DecoderState::RequestVector"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState9TensorPtrE", "tensorrt_llm::runtime::decoder::DecoderState::TensorPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState34allocateSpeculativeDecodingBuffersE23SpeculativeDecodingModeN8nvinfer18DataTypeERK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::allocateSpeculativeDecodingBuffers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState34allocateSpeculativeDecodingBuffersE23SpeculativeDecodingModeN8nvinfer18DataTypeERK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::allocateSpeculativeDecodingBuffers::bufferManager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState34allocateSpeculativeDecodingBuffersE23SpeculativeDecodingModeN8nvinfer18DataTypeERK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::allocateSpeculativeDecodingBuffers::dtype"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState34allocateSpeculativeDecodingBuffersE23SpeculativeDecodingModeN8nvinfer18DataTypeERK13BufferManager", 
"tensorrt_llm::runtime::decoder::DecoderState::allocateSpeculativeDecodingBuffers::speculativeDecodingMode"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState16disableLookaheadERK13RequestVector", "tensorrt_llm::runtime::decoder::DecoderState::disableLookahead"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState16disableLookaheadERK13RequestVector", "tensorrt_llm::runtime::decoder::DecoderState::disableLookahead::genRequests"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState24getAcceptedLengthsCumSumEv", "tensorrt_llm::runtime::decoder::DecoderState::getAcceptedLengthsCumSum"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState22getAcceptedPackedPathsEv", "tensorrt_llm::runtime::decoder::DecoderState::getAcceptedPackedPaths"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getAllNewTokensEv", "tensorrt_llm::runtime::decoder::DecoderState::getAllNewTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState20getBeamSearchBuffersEv", "tensorrt_llm::runtime::decoder::DecoderState::getBeamSearchBuffers"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getCumLogProbsE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getCumLogProbs"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getCumLogProbsEv", "tensorrt_llm::runtime::decoder::DecoderState::getCumLogProbs"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getCumLogProbsE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getCumLogProbs::batchIdx"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState16getFinishReasonsEv", "tensorrt_llm::runtime::decoder::DecoderState::getFinishReasons"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState16getFinishedStepsEv", "tensorrt_llm::runtime::decoder::DecoderState::getFinishedSteps"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getFinishedSumEv", 
"tensorrt_llm::runtime::decoder::DecoderState::getFinishedSum"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getGatheredIdsE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getGatheredIds"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getGatheredIdsEv", "tensorrt_llm::runtime::decoder::DecoderState::getGatheredIds"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getGatheredIdsE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getGatheredIds::batchIdx"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState6getIdsE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getIds"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState6getIdsEv", "tensorrt_llm::runtime::decoder::DecoderState::getIds"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState6getIdsE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getIds::batchIdx"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState21getJointDecodingInputEv", "tensorrt_llm::runtime::decoder::DecoderState::getJointDecodingInput"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState22getJointDecodingOutputEv", "tensorrt_llm::runtime::decoder::DecoderState::getJointDecodingOutput"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState11getLogProbsE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getLogProbs"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState11getLogProbsEv", "tensorrt_llm::runtime::decoder::DecoderState::getLogProbs"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState11getLogProbsE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getLogProbs::batchIdx"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBeamWidthEv", "tensorrt_llm::runtime::decoder::DecoderState::getMaxBeamWidth"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState27getMaxDecodingDecoderTokensEv", "tensorrt_llm::runtime::decoder::DecoderState::getMaxDecodingDecoderTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getMaxDecodingEngineTokensEv", "tensorrt_llm::runtime::decoder::DecoderState::getMaxDecodingEngineTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState20getMaxSequenceLengthEv", "tensorrt_llm::runtime::decoder::DecoderState::getMaxSequenceLength"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getNextDraftTokensEv", "tensorrt_llm::runtime::decoder::DecoderState::getNextDraftTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState25getNextDraftTokensLengthsEv", "tensorrt_llm::runtime::decoder::DecoderState::getNextDraftTokensLengths"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getNumDecodingEngineTokensE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getNumDecodingEngineTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getNumDecodingEngineTokensEv", "tensorrt_llm::runtime::decoder::DecoderState::getNumDecodingEngineTokens"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getNumDecodingEngineTokensE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getNumDecodingEngineTokens::batchIdx"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState12getParentIdsEv", "tensorrt_llm::runtime::decoder::DecoderState::getParentIds"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState25getPrevDraftTokensLengthsEv", "tensorrt_llm::runtime::decoder::DecoderState::getPrevDraftTokensLengths"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsEv", "tensorrt_llm::runtime::decoder::DecoderState::getSequenceLengths"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getSpeculativeDecodingModeEv", 
"tensorrt_llm::runtime::decoder::DecoderState::getSpeculativeDecodingMode"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState18mBeamSearchBuffersE", "tensorrt_llm::runtime::decoder::DecoderState::mBeamSearchBuffers"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState14mFinishedStepsE", "tensorrt_llm::runtime::decoder::DecoderState::mFinishedSteps"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState19mJointDecodingInputE", "tensorrt_llm::runtime::decoder::DecoderState::mJointDecodingInput"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState20mJointDecodingOutputE", "tensorrt_llm::runtime::decoder::DecoderState::mJointDecodingOutput"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState13mMaxBatchSizeE", "tensorrt_llm::runtime::decoder::DecoderState::mMaxBatchSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState13mMaxBeamWidthE", "tensorrt_llm::runtime::decoder::DecoderState::mMaxBeamWidth"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState25mMaxDecodingDecoderTokensE", "tensorrt_llm::runtime::decoder::DecoderState::mMaxDecodingDecoderTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24mMaxDecodingEngineTokensE", "tensorrt_llm::runtime::decoder::DecoderState::mMaxDecodingEngineTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState18mMaxSequenceLengthE", "tensorrt_llm::runtime::decoder::DecoderState::mMaxSequenceLength"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24mNumDecodingEngineTokensE", "tensorrt_llm::runtime::decoder::DecoderState::mNumDecodingEngineTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24mSpeculativeDecodingModeE", "tensorrt_llm::runtime::decoder::DecoderState::mSpeculativeDecodingMode"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState26setNumDecodingEngineTokensE10SizeType3210SizeType32", 
"tensorrt_llm::runtime::decoder::DecoderState::setNumDecodingEngineTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState26setNumDecodingEngineTokensE10SizeType3210SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::setNumDecodingEngineTokens::batchIdx"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState26setNumDecodingEngineTokensE10SizeType3210SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::setNumDecodingEngineTokens::numTokens"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setup"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setup::bufferManager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setup::maxAttentionWindow"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setup::maxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setup::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setup::maxSequenceLength"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setup::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setup::sinkTokenLength"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setup::worldConfig"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState10setupEagleEN12EagleBuffers6InputsE", "tensorrt_llm::runtime::decoder::DecoderState::setupEagle"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState10setupEagleEN12EagleBuffers6InputsE", "tensorrt_llm::runtime::decoder::DecoderState::setupEagle::eagleBuffers"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState24setupExplicitDraftTokensEN26ExplicitDraftTokensBuffers6InputsE", "tensorrt_llm::runtime::decoder::DecoderState::setupExplicitDraftTokens"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState24setupExplicitDraftTokensEN26ExplicitDraftTokensBuffers6InputsE", "tensorrt_llm::runtime::decoder::DecoderState::setupExplicitDraftTokens::explicitDraftTokensBuffers"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14setupLookaheadE24LookaheadDecodingBuffers", "tensorrt_llm::runtime::decoder::DecoderState::setupLookahead"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14setupLookaheadE24LookaheadDecodingBuffers", "tensorrt_llm::runtime::decoder::DecoderState::setupLookahead::lookaheadDecodingBuffers"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24setupSpeculativeDecodingERK23SpeculativeDecodingMode10SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setupSpeculativeDecoding"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24setupSpeculativeDecodingERK23SpeculativeDecodingMode10SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setupSpeculativeDecoding::bufferManager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24setupSpeculativeDecodingERK23SpeculativeDecodingMode10SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setupSpeculativeDecoding::maxTokensPerEngineStep"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24setupSpeculativeDecodingERK23SpeculativeDecodingMode10SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setupSpeculativeDecoding::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24setupSpeculativeDecodingERK23SpeculativeDecodingMode10SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setupSpeculativeDecoding::speculativeDecodingMode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24setupSpeculativeDecodingERK23SpeculativeDecodingMode10SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setupSpeculativeDecoding::worldConfig"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batchE", "tensorrt_llm::runtime::decoder_batch"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batchE", "tensorrt_llm::runtime::decoder_batch"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5InputE", "tensorrt_llm::runtime::decoder_batch::Input"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input5InputERKNSt6vectorI14TensorConstPtrEE", "tensorrt_llm::runtime::decoder_batch::Input::Input"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input5InputERKNSt6vectorINSt6vectorI14TensorConstPtrEEEE10SizeType32", "tensorrt_llm::runtime::decoder_batch::Input::Input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input5InputERKNSt6vectorI14TensorConstPtrEE", "tensorrt_llm::runtime::decoder_batch::Input::Input::logits"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input5InputERKNSt6vectorINSt6vectorI14TensorConstPtrEEEE10SizeType32", "tensorrt_llm::runtime::decoder_batch::Input::Input::logits"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input5InputERKNSt6vectorINSt6vectorI14TensorConstPtrEEEE10SizeType32", "tensorrt_llm::runtime::decoder_batch::Input::Input::maxDecoderSteps"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input14TensorConstPtrE", "tensorrt_llm::runtime::decoder_batch::Input::TensorConstPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input9TensorPtrE", "tensorrt_llm::runtime::decoder_batch::Input::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input10batchSlotsE", "tensorrt_llm::runtime::decoder_batch::Input::batchSlots"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input22batchSlotsRequestOrderE", "tensorrt_llm::runtime::decoder_batch::Input::batchSlotsRequestOrder"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input16cacheIndirectionE", "tensorrt_llm::runtime::decoder_batch::Input::cacheIndirection"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input11eagleInputsE", "tensorrt_llm::runtime::decoder_batch::Input::eagleInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input15eagleLastInputsE", "tensorrt_llm::runtime::decoder_batch::Input::eagleLastInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input25explicitDraftTokensInputsE", 
"tensorrt_llm::runtime::decoder_batch::Input::explicitDraftTokensInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input29explicitDraftTokensLastInputsE", "tensorrt_llm::runtime::decoder_batch::Input::explicitDraftTokensLastInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input15generationStepsE", "tensorrt_llm::runtime::decoder_batch::Input::generationSteps"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input6logitsE", "tensorrt_llm::runtime::decoder_batch::Input::logits"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input15maxDecoderStepsE", "tensorrt_llm::runtime::decoder_batch::Input::maxDecoderSteps"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input20predictedDraftLogitsE", "tensorrt_llm::runtime::decoder_batch::Input::predictedDraftLogits"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch6OutputE", "tensorrt_llm::runtime::decoder_batch::Output"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch6Output6OutputEv", "tensorrt_llm::runtime::decoder_batch::Output::Output"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch6Output9TensorPtrE", "tensorrt_llm::runtime::decoder_batch::Output::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch6Output16cacheIndirectionE", "tensorrt_llm::runtime::decoder_batch::Output::cacheIndirection"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7RequestE", "tensorrt_llm::runtime::decoder_batch::Request"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request9BufferPtrE", "tensorrt_llm::runtime::decoder_batch::Request::BufferPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request7RequestE14TensorConstPtr10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::decoder_batch::Request::Request"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request7RequestE14TensorConstPtr10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EE", 
"tensorrt_llm::runtime::decoder_batch::Request::Request::endId"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request7RequestE14TensorConstPtr10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::decoder_batch::Request::Request::ids"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request7RequestE14TensorConstPtr10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::decoder_batch::Request::Request::inputLen"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request7RequestE14TensorConstPtr10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::decoder_batch::Request::Request::maxNewTokens"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request14TensorConstPtrE", "tensorrt_llm::runtime::decoder_batch::Request::TensorConstPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request9TensorPtrE", "tensorrt_llm::runtime::decoder_batch::Request::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request12badWordsListE", "tensorrt_llm::runtime::decoder_batch::Request::badWordsList"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request11draftLogitsE", "tensorrt_llm::runtime::decoder_batch::Request::draftLogits"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request11draftTokensE", "tensorrt_llm::runtime::decoder_batch::Request::draftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request5dtypeE", "tensorrt_llm::runtime::decoder_batch::Request::dtype"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request11eagleConfigE", "tensorrt_llm::runtime::decoder_batch::Request::eagleConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request13embeddingBiasE", "tensorrt_llm::runtime::decoder_batch::Request::embeddingBias"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request5endIdE", 
"tensorrt_llm::runtime::decoder_batch::Request::endId"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request28generatedTokensPerEngineStepE", "tensorrt_llm::runtime::decoder_batch::Request::generatedTokensPerEngineStep"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request3idsE", "tensorrt_llm::runtime::decoder_batch::Request::ids"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request8inputLenE", "tensorrt_llm::runtime::decoder_batch::Request::inputLen"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request22lookaheadRuntimeConfigE", "tensorrt_llm::runtime::decoder_batch::Request::lookaheadRuntimeConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request12maxNewTokensE", "tensorrt_llm::runtime::decoder_batch::Request::maxNewTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request11medusaPathsE", "tensorrt_llm::runtime::decoder_batch::Request::medusaPaths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request13medusaTreeIdsE", "tensorrt_llm::runtime::decoder_batch::Request::medusaTreeIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request13stopWordsListE", "tensorrt_llm::runtime::decoder_batch::Request::stopWordsList"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20getDefaultBatchSlotsEN7runtime10SizeType32E", "tensorrt_llm::runtime::getDefaultBatchSlots"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20getDefaultBatchSlotsEN7runtime10SizeType32E", "tensorrt_llm::runtime::getDefaultBatchSlots::batchSize"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime15ipcNvlsAllocateE6size_tNSt3setIiEE", "tensorrt_llm::runtime::ipcNvlsAllocate"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime15ipcNvlsAllocateE6size_tNSt3setIiEE", "tensorrt_llm::runtime::ipcNvlsAllocate::ranks"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime15ipcNvlsAllocateE6size_tNSt3setIiEE", "tensorrt_llm::runtime::ipcNvlsAllocate::size"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ipcNvlsFreeEP13IpcNvlsHandle", 
"tensorrt_llm::runtime::ipcNvlsFree"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ipcNvlsFreeEP13IpcNvlsHandle", "tensorrt_llm::runtime::ipcNvlsFree::handle"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime16ipcNvlsSupportedEv", "tensorrt_llm::runtime::ipcNvlsSupported"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20lamportInitializeAllEPvPvPv6size_t", "tensorrt_llm::runtime::lamportInitializeAll"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20lamportInitializeAllEPvPvPv6size_t", "tensorrt_llm::runtime::lamportInitializeAll::buffer_0"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20lamportInitializeAllEPvPvPv6size_t", "tensorrt_llm::runtime::lamportInitializeAll::buffer_1"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20lamportInitializeAllEPvPvPv6size_t", "tensorrt_llm::runtime::lamportInitializeAll::buffer_2"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20lamportInitializeAllEPvPvPv6size_t", "tensorrt_llm::runtime::lamportInitializeAll::size"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK10LoraModule", "tensorrt_llm::runtime::operator&lt;&lt;"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK26LoraCachePageManagerConfig", "tensorrt_llm::runtime::operator&lt;&lt;"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK7IBuffer", "tensorrt_llm::runtime::operator&lt;&lt;"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK7ITensor", "tensorrt_llm::runtime::operator&lt;&lt;"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERKN7ITensor5ShapeE", "tensorrt_llm::runtime::operator&lt;&lt;"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERKN9LoraCache21TaskLayerModuleConfigE", "tensorrt_llm::runtime::operator&lt;&lt;"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK7IBuffer", "tensorrt_llm::runtime::operator&lt;&lt;::buffer"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK26LoraCachePageManagerConfig", "tensorrt_llm::runtime::operator&lt;&lt;::c"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERKN7ITensor5ShapeE", "tensorrt_llm::runtime::operator&lt;&lt;::dims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK10LoraModule", "tensorrt_llm::runtime::operator&lt;&lt;::module"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK26LoraCachePageManagerConfig", "tensorrt_llm::runtime::operator&lt;&lt;::os"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERKN9LoraCache21TaskLayerModuleConfigE", "tensorrt_llm::runtime::operator&lt;&lt;::os"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK10LoraModule", "tensorrt_llm::runtime::operator&lt;&lt;::output"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK7IBuffer", "tensorrt_llm::runtime::operator&lt;&lt;::output"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK7ITensor", "tensorrt_llm::runtime::operator&lt;&lt;::output"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERKN7ITensor5ShapeE", "tensorrt_llm::runtime::operator&lt;&lt;::output"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK7ITensor", "tensorrt_llm::runtime::operator&lt;&lt;::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERKN9LoraCache21TaskLayerModuleConfigE", "tensorrt_llm::runtime::operator&lt;&lt;::v"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9to_stringERK26LoraCachePageManagerConfig", "tensorrt_llm::runtime::to_string"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9to_stringERKN9LoraCache21TaskLayerModuleConfigE", "tensorrt_llm::runtime::to_string"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9to_stringERK26LoraCachePageManagerConfig", "tensorrt_llm::runtime::to_string::c"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9to_stringERKN9LoraCache21TaskLayerModuleConfigE", "tensorrt_llm::runtime::to_string::v"], [82, 9, 0, "-", "tensorrt_llm"]], "tensorrt_llm": [[77, 9, 0, "-", "functional"], [79, 9, 0, "-", "models"], [80, 9, 0, "-", "plugin"], [81, 9, 0, "-", "quantization"], [82, 9, 0, "-", "runtime"]], 
"tensorrt_llm.functional": [[77, 10, 1, "", "AllReduceFusionOp"], [77, 10, 1, "", "AllReduceParams"], [77, 10, 1, "", "AllReduceStrategy"], [77, 10, 1, "", "AttentionMaskType"], [77, 10, 1, "", "Conditional"], [77, 10, 1, "", "DimRange"], [77, 10, 1, "", "LayerNormPositionType"], [77, 10, 1, "", "LayerNormType"], [77, 10, 1, "", "MLPType"], [77, 10, 1, "", "PositionEmbeddingType"], [77, 10, 1, "", "RopeEmbeddingUtils"], [77, 10, 1, "", "RotaryScalingType"], [77, 10, 1, "", "SideStreamIDType"], [77, 10, 1, "", "SliceInputType"], [77, 10, 1, "", "Tensor"], [77, 14, 1, "", "abs"], [77, 14, 1, "", "activation"], [77, 14, 1, "", "add"], [77, 14, 1, "", "allgather"], [77, 14, 1, "", "allreduce"], [77, 14, 1, "", "arange"], [77, 14, 1, "", "argmax"], [77, 14, 1, "", "assertion"], [77, 14, 1, "", "avg_pool2d"], [77, 14, 1, "", "bert_attention"], [77, 14, 1, "", "broadcast_helper"], [77, 14, 1, "", "cast"], [77, 14, 1, "", "categorical_sample"], [77, 14, 1, "", "chunk"], [77, 14, 1, "", "clip"], [77, 14, 1, "", "concat"], [77, 14, 1, "", "constant"], [77, 14, 1, "", "constant_to_tensor_"], [77, 14, 1, "", "constants_to_tensors_"], [77, 14, 1, "", "conv1d"], [77, 14, 1, "", "conv2d"], [77, 14, 1, "", "conv3d"], [77, 14, 1, "", "conv_transpose2d"], [77, 14, 1, "", "cos"], [77, 14, 1, "", "cp_split_plugin"], [77, 14, 1, "", "create_allreduce_plugin"], [77, 14, 1, "", "cuda_stream_sync"], [77, 14, 1, "", "cumsum"], [77, 14, 1, "", "div"], [77, 14, 1, "", "dora_plugin"], [77, 14, 1, "", "einsum"], [77, 14, 1, "", "elementwise_binary"], [77, 14, 1, "", "embedding"], [77, 14, 1, "", "eq"], [77, 14, 1, "", "exp"], [77, 14, 1, "", "expand"], [77, 14, 1, "", "expand_dims"], [77, 14, 1, "", "expand_dims_like"], [77, 14, 1, "", "expand_mask"], [77, 14, 1, "", "flatten"], [77, 14, 1, "", "flip"], [77, 14, 1, "", "floordiv"], [77, 14, 1, "", "gather"], [77, 14, 1, "", "gather_last_token_logits"], [77, 14, 1, "", "gather_nd"], [77, 14, 1, "", "gegelu"], [77, 14, 1, "", "geglu"], [77, 14, 
1, "", "gelu"], [77, 14, 1, "", "gemm_allreduce"], [77, 14, 1, "", "gemm_swiglu"], [77, 14, 1, "", "generate_alibi_biases"], [77, 14, 1, "", "generate_alibi_slopes"], [77, 14, 1, "", "generate_logn_scaling"], [77, 14, 1, "", "gpt_attention"], [77, 14, 1, "", "group_norm"], [77, 14, 1, "", "gt"], [77, 14, 1, "", "identity"], [77, 14, 1, "", "index_select"], [77, 14, 1, "", "int_clip"], [77, 14, 1, "", "interpolate"], [77, 14, 1, "", "is_gated_activation"], [77, 14, 1, "", "layer_norm"], [77, 14, 1, "", "log"], [77, 14, 1, "", "log_softmax"], [77, 14, 1, "", "lora_plugin"], [77, 14, 1, "", "low_latency_gemm"], [77, 14, 1, "", "low_latency_gemm_swiglu"], [77, 14, 1, "", "lt"], [77, 14, 1, "", "mamba_conv1d"], [77, 14, 1, "", "masked_scatter"], [77, 14, 1, "", "masked_select"], [77, 14, 1, "", "matmul"], [77, 14, 1, "", "max"], [77, 14, 1, "", "maximum"], [77, 14, 1, "", "mean"], [77, 14, 1, "", "meshgrid2d"], [77, 14, 1, "", "min"], [77, 14, 1, "", "minimum"], [77, 14, 1, "", "modulo"], [77, 14, 1, "", "mul"], [77, 14, 1, "", "non_gated_version"], [77, 14, 1, "", "nonzero"], [77, 14, 1, "", "not_op"], [77, 14, 1, "", "op_and"], [77, 14, 1, "", "op_or"], [77, 14, 1, "", "op_xor"], [77, 14, 1, "", "outer"], [77, 14, 1, "", "pad"], [77, 14, 1, "", "permute"], [77, 14, 1, "", "pow"], [77, 14, 1, "", "prod"], [77, 14, 1, "", "quick_gelu"], [77, 14, 1, "", "rand"], [77, 14, 1, "", "rearrange"], [77, 14, 1, "", "recv"], [77, 14, 1, "", "reduce"], [77, 14, 1, "", "reduce_scatter"], [77, 14, 1, "", "relu"], [77, 14, 1, "", "repeat"], [77, 14, 1, "", "repeat_interleave"], [77, 14, 1, "", "rg_lru"], [77, 14, 1, "", "rms_norm"], [77, 14, 1, "", "round"], [77, 14, 1, "", "scatter"], [77, 14, 1, "", "scatter_nd"], [77, 14, 1, "", "select"], [77, 14, 1, "", "selective_scan"], [77, 14, 1, "", "send"], [77, 14, 1, "", "shape"], [77, 14, 1, "", "sigmoid"], [77, 14, 1, "", "silu"], [77, 14, 1, "", "sin"], [77, 14, 1, "", "slice"], [77, 14, 1, "", "softmax"], [77, 14, 1, "", "softplus"], 
[77, 14, 1, "", "split"], [77, 14, 1, "", "sqrt"], [77, 14, 1, "", "squared_relu"], [77, 14, 1, "", "squeeze"], [77, 14, 1, "", "stack"], [77, 14, 1, "", "sub"], [77, 14, 1, "", "sum"], [77, 14, 1, "", "swiglu"], [77, 14, 1, "", "tanh"], [77, 14, 1, "", "topk"], [77, 14, 1, "", "transpose"], [77, 14, 1, "", "unary"], [77, 14, 1, "", "unbind"], [77, 14, 1, "", "unsqueeze"], [77, 14, 1, "", "view"], [77, 14, 1, "", "where"]], "tensorrt_llm.functional.AllReduceFusionOp": [[77, 11, 1, "", "LAST_PROCESS_FOR_UB"], [77, 11, 1, "", "MOE_ALLREDUCE_RESIDUAL_RMS_NORM"], [77, 11, 1, "", "NONE"], [77, 11, 1, "", "RESIDUAL_RMS_NORM"], [77, 11, 1, "", "RESIDUAL_RMS_NORM_OUT_QUANT_FP8"], [77, 11, 1, "", "RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4"], [77, 11, 1, "", "RESIDUAL_RMS_NORM_QUANT_FP8"], [77, 11, 1, "", "RESIDUAL_RMS_NORM_QUANT_NVFP4"], [77, 11, 1, "", "RESIDUAL_RMS_PREPOST_NORM"]], "tensorrt_llm.functional.AllReduceParams": [[77, 12, 1, "", "has_affine"], [77, 12, 1, "", "has_bias"], [77, 12, 1, "", "has_scale"], [77, 12, 1, "", "update_strategy"]], "tensorrt_llm.functional.AllReduceStrategy": [[77, 11, 1, "", "AUTO"], [77, 11, 1, "", "MIN_LATENCY"], [77, 11, 1, "", "NCCL"], [77, 11, 1, "", "ONESHOT"], [77, 11, 1, "", "TWOSHOT"], [77, 11, 1, "", "UB"]], "tensorrt_llm.functional.AttentionMaskType": [[77, 11, 1, "", "bidirectional"], [77, 11, 1, "", "bidirectionalglm"], [77, 11, 1, "", "blocksparse"], [77, 11, 1, "", "causal"], [77, 11, 1, "", "custom_mask"], [77, 11, 1, "", "padding"], [77, 11, 1, "", "sliding_window_causal"]], "tensorrt_llm.functional.Conditional": [[77, 12, 1, "", "add_input"], [77, 12, 1, "", "add_output"]], "tensorrt_llm.functional.LayerNormPositionType": [[77, 11, 1, "", "post_layernorm"], [77, 11, 1, "", "pre_layernorm"]], "tensorrt_llm.functional.LayerNormType": [[77, 11, 1, "", "GroupNorm"], [77, 11, 1, "", "LayerNorm"], [77, 11, 1, "", "RmsNorm"]], "tensorrt_llm.functional.MLPType": [[77, 11, 1, "", "FusedGatedMLP"], [77, 11, 1, "", "GatedMLP"], [77, 11, 
1, "", "MLP"]], "tensorrt_llm.functional.PositionEmbeddingType": [[77, 11, 1, "", "alibi"], [77, 11, 1, "", "alibi_with_scale"], [77, 11, 1, "", "chatglm"], [77, 12, 1, "", "choices"], [77, 11, 1, "", "deferred"], [77, 12, 1, "", "from_string"], [77, 12, 1, "", "is_alibi"], [77, 12, 1, "", "is_deferred"], [77, 12, 1, "", "is_mrope"], [77, 12, 1, "", "is_rope"], [77, 11, 1, "", "learned_absolute"], [77, 11, 1, "", "long_rope"], [77, 11, 1, "", "mrope"], [77, 11, 1, "", "relative"], [77, 11, 1, "", "rope_gpt_neox"], [77, 11, 1, "", "rope_gptj"], [77, 11, 1, "", "yarn"]], "tensorrt_llm.functional.RopeEmbeddingUtils": [[77, 12, 1, "", "apply_llama3_scaling"], [77, 12, 1, "", "apply_rotary_pos_emb"], [77, 12, 1, "", "apply_rotary_pos_emb_chatglm"], [77, 12, 1, "", "apply_rotary_pos_emb_cogvlm"], [77, 12, 1, "", "create_fake_weight"], [77, 12, 1, "", "create_sinusoidal_positions"], [77, 12, 1, "", "create_sinusoidal_positions_for_attention_plugin"], [77, 12, 1, "", "create_sinusoidal_positions_for_cogvlm_attention_plugin"], [77, 12, 1, "", "create_sinusoidal_positions_long_rope"], [77, 12, 1, "", "create_sinusoidal_positions_yarn"], [77, 12, 1, "", "rotate_every_two"], [77, 12, 1, "", "rotate_half"]], "tensorrt_llm.functional.RotaryScalingType": [[77, 11, 1, "", "dynamic"], [77, 12, 1, "", "from_string"], [77, 11, 1, "", "linear"], [77, 11, 1, "", "llama3"], [77, 11, 1, "", "longrope"], [77, 11, 1, "", "mrope"], [77, 11, 1, "", "none"], [77, 11, 1, "", "yarn"]], "tensorrt_llm.functional.SideStreamIDType": [[77, 11, 1, "", "disable"], [77, 11, 1, "", "moe"]], "tensorrt_llm.functional.SliceInputType": [[77, 11, 1, "", "axes"], [77, 11, 1, "", "data"], [77, 11, 1, "", "fill_value"], [77, 11, 1, "", "size"], [77, 11, 1, "", "start"], [77, 11, 1, "", "stride"]], "tensorrt_llm.functional.Tensor": [[77, 12, 1, "", "abs"], [77, 12, 1, "", "cast"], [77, 13, 1, "", "dtype"], [77, 12, 1, "", "flatten"], [77, 12, 1, "", "get_parent"], [77, 12, 1, "", "get_users"], [77, 12, 1, "", 
"is_dynamic"], [77, 12, 1, "", "is_trt_wrapper"], [77, 13, 1, "", "location"], [77, 12, 1, "", "log"], [77, 12, 1, "", "mark_output"], [77, 12, 1, "", "max"], [77, 12, 1, "", "mean"], [77, 13, 1, "", "name"], [77, 12, 1, "", "ndim"], [77, 13, 1, "", "network"], [77, 12, 1, "", "permute"], [77, 12, 1, "", "rank"], [77, 12, 1, "", "repeat"], [77, 12, 1, "", "replace_all_uses_with"], [77, 12, 1, "", "select"], [77, 13, 1, "", "shape"], [77, 12, 1, "", "size"], [77, 12, 1, "", "split"], [77, 12, 1, "", "sqrt"], [77, 12, 1, "", "squeeze"], [77, 12, 1, "", "transpose"], [77, 12, 1, "", "unbind"], [77, 12, 1, "", "unsqueeze"], [77, 12, 1, "", "view"]], "tensorrt_llm.layers": [[78, 9, 0, "-", "activation"], [78, 9, 0, "-", "attention"], [78, 9, 0, "-", "cast"], [78, 9, 0, "-", "conv"], [78, 9, 0, "-", "embedding"], [78, 9, 0, "-", "linear"], [78, 9, 0, "-", "mlp"], [78, 9, 0, "-", "normalization"], [78, 9, 0, "-", "pooling"]], "tensorrt_llm.layers.activation": [[78, 10, 1, "", "Mish"]], "tensorrt_llm.layers.activation.Mish": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.attention": [[78, 10, 1, "", "Attention"], [78, 10, 1, "", "AttentionMaskParams"], [78, 10, 1, "", "AttentionParams"], [78, 10, 1, "", "BertAttention"], [78, 10, 1, "", "BlockSparseAttnParams"], [78, 10, 1, "", "CogVLMAttention"], [78, 10, 1, "", "DeepseekV2Attention"], [78, 10, 1, "", "DiffusersAttention"], [78, 10, 1, "", "KeyValueCacheParams"], [78, 10, 1, "", "MropeParams"], [78, 10, 1, "", "SpecDecodingParams"], [78, 14, 1, "", "compute_relative_bias"], [78, 14, 1, "", "make_causal_mask"]], "tensorrt_llm.layers.attention.Attention": [[78, 12, 1, "", "create_attention_const_params"], [78, 12, 1, "", "fill_attention_params"], [78, 12, 1, "", "forward"], [78, 12, 1, "", "postprocess"], [78, 12, 1, "", "set_rel_attn_table"]], "tensorrt_llm.layers.attention.AttentionParams": [[78, 12, 1, "", "fill_attention_const_params_for_long_rope"], [78, 12, 1, "", "fill_attention_const_params_for_rope"], [78, 12, 
1, "", "is_valid"], [78, 12, 1, "", "is_valid_cross_attn"]], "tensorrt_llm.layers.attention.BertAttention": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.attention.CogVLMAttention": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.attention.DeepseekV2Attention": [[78, 12, 1, "", "forward"], [78, 12, 1, "", "postprocess"], [78, 12, 1, "", "weight_loader"]], "tensorrt_llm.layers.attention.DiffusersAttention": [[78, 12, 1, "", "forward"], [78, 12, 1, "", "joint_attn_forward"]], "tensorrt_llm.layers.attention.KeyValueCacheParams": [[78, 12, 1, "", "fill_none_tensor_list"], [78, 12, 1, "", "get_first_past_key_value"], [78, 12, 1, "", "is_valid"]], "tensorrt_llm.layers.cast": [[78, 10, 1, "", "Cast"]], "tensorrt_llm.layers.cast.Cast": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.conv": [[78, 10, 1, "", "Conv1d"], [78, 10, 1, "", "Conv2d"], [78, 10, 1, "", "Conv3d"], [78, 10, 1, "", "ConvTranspose2d"]], "tensorrt_llm.layers.conv.Conv1d": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.conv.Conv2d": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.conv.Conv3d": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.conv.ConvTranspose2d": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.embedding": [[78, 10, 1, "", "CombinedTimestepLabelEmbeddings"], [78, 10, 1, "", "CombinedTimestepTextProjEmbeddings"], [78, 10, 1, "", "Embedding"], [78, 10, 1, "", "LabelEmbedding"], [78, 10, 1, "", "PixArtAlphaTextProjection"], [78, 10, 1, "", "PromptTuningEmbedding"], [78, 10, 1, "", "SD3PatchEmbed"], [78, 10, 1, "", "TimestepEmbedding"], [78, 10, 1, "", "Timesteps"], [78, 14, 1, "", "get_1d_sincos_pos_embed_from_grid"], [78, 14, 1, "", "get_2d_sincos_pos_embed"], [78, 14, 1, "", "get_2d_sincos_pos_embed_from_grid"], [78, 14, 1, "", "get_timestep_embedding"]], "tensorrt_llm.layers.embedding.CombinedTimestepLabelEmbeddings": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.embedding.CombinedTimestepTextProjEmbeddings": [[78, 12, 1, "", "forward"]], 
"tensorrt_llm.layers.embedding.Embedding": [[78, 12, 1, "", "forward"], [78, 12, 1, "", "postprocess"], [78, 12, 1, "", "weight_loader"]], "tensorrt_llm.layers.embedding.LabelEmbedding": [[78, 12, 1, "", "forward"], [78, 12, 1, "", "token_drop"]], "tensorrt_llm.layers.embedding.PixArtAlphaTextProjection": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.embedding.PromptTuningEmbedding": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.embedding.SD3PatchEmbed": [[78, 12, 1, "", "cropped_pos_embed"], [78, 12, 1, "", "forward"]], "tensorrt_llm.layers.embedding.TimestepEmbedding": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.embedding.Timesteps": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.linear": [[78, 11, 1, "", "ColumnLinear"], [78, 10, 1, "", "Linear"], [78, 10, 1, "", "LinearBase"], [78, 10, 1, "", "RowLinear"]], "tensorrt_llm.layers.linear.Linear": [[78, 12, 1, "", "collect_and_bias"], [78, 12, 1, "", "postprocess"], [78, 12, 1, "", "tp_split_dim"]], "tensorrt_llm.layers.linear.LinearBase": [[78, 12, 1, "", "collect_and_bias"], [78, 12, 1, "", "forward"], [78, 12, 1, "", "get_weight"], [78, 12, 1, "", "multiply_and_lora"], [78, 12, 1, "", "multiply_collect"], [78, 12, 1, "", "tp_split_dim"], [78, 12, 1, "", "weight_loader"]], "tensorrt_llm.layers.linear.RowLinear": [[78, 12, 1, "", "collect_and_bias"], [78, 12, 1, "", "multiply_collect"], [78, 12, 1, "", "tp_split_dim"]], "tensorrt_llm.layers.mlp": [[78, 10, 1, "", "FusedGatedMLP"], [78, 10, 1, "", "GatedMLP"], [78, 10, 1, "", "LinearActivation"], [78, 10, 1, "", "LinearApproximateGELU"], [78, 10, 1, "", "LinearGEGLU"], [78, 10, 1, "", "LinearGELU"], [78, 10, 1, "", "LinearSwiGLU"], [78, 10, 1, "", "MLP"], [78, 14, 1, "", "fc_gate_dora"], [78, 14, 1, "", "fc_gate_lora"]], "tensorrt_llm.layers.mlp.FusedGatedMLP": [[78, 12, 1, "", "fc_gate"], [78, 12, 1, "", "fc_gate_plugin"], [78, 12, 1, "", "forward"]], "tensorrt_llm.layers.mlp.GatedMLP": [[78, 12, 1, "", "forward"]], 
"tensorrt_llm.layers.mlp.LinearActivation": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.mlp.LinearApproximateGELU": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.mlp.LinearGEGLU": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.mlp.LinearGELU": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.mlp.LinearSwiGLU": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.mlp.MLP": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.normalization": [[78, 10, 1, "", "AdaLayerNorm"], [78, 10, 1, "", "AdaLayerNormContinuous"], [78, 10, 1, "", "AdaLayerNormZero"], [78, 10, 1, "", "AdaLayerNormZeroSingle"], [78, 10, 1, "", "GroupNorm"], [78, 10, 1, "", "LayerNorm"], [78, 10, 1, "", "RmsNorm"], [78, 10, 1, "", "SD35AdaLayerNormZeroX"]], "tensorrt_llm.layers.normalization.AdaLayerNorm": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.normalization.AdaLayerNormContinuous": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.normalization.AdaLayerNormZero": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.normalization.AdaLayerNormZeroSingle": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.normalization.GroupNorm": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.normalization.LayerNorm": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.normalization.RmsNorm": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.normalization.SD35AdaLayerNormZeroX": [[78, 12, 1, "", "forward"]], "tensorrt_llm.layers.pooling": [[78, 10, 1, "", "AvgPool2d"]], "tensorrt_llm.layers.pooling.AvgPool2d": [[78, 12, 1, "", "forward"]], "tensorrt_llm.llmapi": [[65, 10, 1, "", "BatchingType"], [65, 10, 1, "", "BuildCacheConfig"], [65, 10, 1, "", "BuildConfig"], [65, 10, 1, "", "CacheTransceiverConfig"], [65, 10, 1, "", "CalibConfig"], [65, 10, 1, "", "CapacitySchedulerPolicy"], [65, 10, 1, "", "CompletionOutput"], [65, 10, 1, "", "ContextChunkingPolicy"], [65, 10, 1, "", "DisaggregatedParams"], [65, 10, 1, "", "DynamicBatchConfig"], [65, 10, 1, "", "EagleDecodingConfig"], [65, 10, 1, 
"", "ExtendedRuntimePerfKnobConfig"], [65, 10, 1, "", "GuidedDecodingParams"], [65, 10, 1, "", "KvCacheConfig"], [65, 10, 1, "", "KvCacheRetentionConfig"], [65, 10, 1, "", "LLM"], [65, 10, 1, "", "LookaheadDecodingConfig"], [65, 10, 1, "", "MTPDecodingConfig"], [65, 10, 1, "", "MedusaDecodingConfig"], [65, 10, 1, "", "MpiCommSession"], [65, 10, 1, "", "QuantAlgo"], [65, 10, 1, "", "QuantConfig"], [65, 10, 1, "", "RequestError"], [65, 10, 1, "", "RequestOutput"], [65, 10, 1, "", "SamplingParams"], [65, 10, 1, "", "SchedulerConfig"]], "tensorrt_llm.llmapi.BatchingType": [[65, 11, 1, "", "INFLIGHT"], [65, 11, 1, "", "STATIC"]], "tensorrt_llm.llmapi.BuildCacheConfig": [[65, 12, 1, "", "__init__"], [65, 13, 1, "id7", "cache_root"], [65, 13, 1, "id8", "max_cache_storage_gb"], [65, 13, 1, "id9", "max_records"]], "tensorrt_llm.llmapi.BuildConfig": [[65, 12, 1, "", "__init__"], [65, 11, 1, "", "auto_parallel_config"], [65, 11, 1, "", "dry_run"], [65, 11, 1, "", "enable_debug_output"], [65, 11, 1, "", "force_num_profiles"], [65, 12, 1, "", "from_dict"], [65, 12, 1, "", "from_json_file"], [65, 11, 1, "", "gather_context_logits"], [65, 11, 1, "", "gather_generation_logits"], [65, 11, 1, "", "input_timing_cache"], [65, 11, 1, "", "kv_cache_type"], [65, 11, 1, "", "lora_config"], [65, 11, 1, "", "max_batch_size"], [65, 11, 1, "", "max_beam_width"], [65, 11, 1, "", "max_draft_len"], [65, 11, 1, "", "max_encoder_input_len"], [65, 11, 1, "", "max_input_len"], [65, 11, 1, "", "max_num_tokens"], [65, 11, 1, "", "max_prompt_embedding_table_size"], [65, 11, 1, "", "max_seq_len"], [65, 11, 1, "", "monitor_memory"], [65, 11, 1, "", "opt_batch_size"], [65, 11, 1, "", "opt_num_tokens"], [65, 11, 1, "", "output_timing_cache"], [65, 11, 1, "", "plugin_config"], [65, 11, 1, "", "profiling_verbosity"], [65, 11, 1, "", "speculative_decoding_mode"], [65, 11, 1, "", "strongly_typed"], [65, 12, 1, "", "to_dict"], [65, 12, 1, "", "update"], [65, 12, 1, "", "update_from_dict"], [65, 12, 1, "", 
"update_kv_cache_type"], [65, 11, 1, "", "use_mrope"], [65, 11, 1, "", "use_refit"], [65, 11, 1, "", "use_strip_plan"], [65, 11, 1, "", "visualize_network"], [65, 11, 1, "", "weight_sparsity"], [65, 11, 1, "", "weight_streaming"]], "tensorrt_llm.llmapi.CacheTransceiverConfig": [[65, 15, 1, "", "max_num_tokens"], [65, 11, 1, "", "model_config"]], "tensorrt_llm.llmapi.CalibConfig": [[65, 15, 1, "", "calib_batch_size"], [65, 15, 1, "", "calib_batches"], [65, 15, 1, "", "calib_dataset"], [65, 15, 1, "", "calib_max_seq_length"], [65, 15, 1, "", "device"], [65, 12, 1, "", "from_dict"], [65, 11, 1, "", "model_config"], [65, 15, 1, "", "random_seed"], [65, 12, 1, "", "to_dict"], [65, 15, 1, "", "tokenizer_max_seq_length"]], "tensorrt_llm.llmapi.CapacitySchedulerPolicy": [[65, 11, 1, "", "GUARANTEED_NO_EVICT"], [65, 11, 1, "", "MAX_UTILIZATION"], [65, 11, 1, "", "STATIC_BATCH"]], "tensorrt_llm.llmapi.CompletionOutput": [[65, 12, 1, "", "__init__"], [65, 11, 1, "", "cumulative_logprob"], [65, 11, 1, "", "disaggregated_params"], [65, 11, 1, "", "finish_reason"], [65, 11, 1, "", "generation_logits"], [65, 11, 1, "", "index"], [65, 13, 1, "id2", "length"], [65, 11, 1, "", "logprobs"], [65, 13, 1, "id3", "logprobs_diff"], [65, 11, 1, "", "prompt_logprobs"], [65, 11, 1, "", "stop_reason"], [65, 11, 1, "", "text"], [65, 13, 1, "id4", "text_diff"], [65, 11, 1, "", "token_ids"], [65, 13, 1, "id5", "token_ids_diff"]], "tensorrt_llm.llmapi.ContextChunkingPolicy": [[65, 11, 1, "", "EQUAL_PROGRESS"], [65, 11, 1, "", "FIRST_COME_FIRST_SERVED"]], "tensorrt_llm.llmapi.DisaggregatedParams": [[65, 12, 1, "", "__init__"], [65, 11, 1, "", "ctx_request_id"], [65, 11, 1, "", "draft_tokens"], [65, 11, 1, "", "first_gen_tokens"], [65, 12, 1, "", "get_context_phase_params"], [65, 12, 1, "", "get_request_type"], [65, 11, 1, "", "opaque_state"], [65, 11, 1, "", "request_type"]], "tensorrt_llm.llmapi.DynamicBatchConfig": [[65, 15, 1, "", "dynamic_batch_moving_average_window"], [65, 15, 1, "", 
"enable_batch_size_tuning"], [65, 15, 1, "", "enable_max_num_tokens_tuning"], [65, 11, 1, "", "model_config"]], "tensorrt_llm.llmapi.EagleDecodingConfig": [[65, 11, 1, "", "decoding_type"], [65, 15, 1, "", "dynamic_tree_max_topK"], [65, 15, 1, "", "eagle_choices"], [65, 12, 1, "", "from_dict"], [65, 15, 1, "", "greedy_sampling"], [65, 15, 1, "", "max_non_leaves_per_layer"], [65, 11, 1, "", "model_config"], [65, 15, 1, "", "num_eagle_layers"], [65, 15, 1, "", "posterior_threshold"], [65, 15, 1, "", "pytorch_eagle_weights_path"], [65, 15, 1, "", "use_dynamic_tree"]], "tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig": [[65, 15, 1, "", "cuda_graph_cache_size"], [65, 15, 1, "", "cuda_graph_mode"], [65, 15, 1, "", "enable_context_fmha_fp32_acc"], [65, 11, 1, "", "model_config"], [65, 15, 1, "", "multi_block_mode"]], "tensorrt_llm.llmapi.GuidedDecodingParams": [[65, 12, 1, "", "__init__"], [65, 11, 1, "", "grammar"], [65, 11, 1, "", "json"], [65, 11, 1, "", "json_object"], [65, 11, 1, "", "regex"], [65, 11, 1, "", "structural_tag"]], "tensorrt_llm.llmapi.KvCacheConfig": [[65, 15, 1, "", "copy_on_partial_reuse"], [65, 15, 1, "", "cross_kv_cache_fraction"], [65, 15, 1, "", "enable_block_reuse"], [65, 15, 1, "", "enable_partial_reuse"], [65, 15, 1, "", "event_buffer_max_size"], [65, 15, 1, "", "free_gpu_memory_fraction"], [65, 15, 1, "", "host_cache_size"], [65, 15, 1, "", "max_attention_window"], [65, 15, 1, "", "max_tokens"], [65, 11, 1, "", "model_config"], [65, 15, 1, "", "onboard_blocks"], [65, 15, 1, "", "secondary_offload_min_priority"], [65, 15, 1, "", "sink_token_length"]], "tensorrt_llm.llmapi.KvCacheRetentionConfig": [[65, 10, 1, "", "TokenRangeRetentionConfig"], [65, 12, 1, "", "__init__"], [65, 13, 1, "", "decode_duration_ms"], [65, 13, 1, "", "decode_retention_priority"], [65, 13, 1, "", "token_range_retention_configs"]], "tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig": [[65, 12, 1, "", "__init__"], [65, 13, 1, "", "duration_ms"], [65, 
13, 1, "", "priority"], [65, 13, 1, "", "token_end"], [65, 13, 1, "", "token_start"]], "tensorrt_llm.llmapi.LLM": [[65, 12, 1, "", "__init__"], [65, 12, 1, "", "generate"], [65, 12, 1, "", "generate_async"], [65, 12, 1, "", "get_kv_cache_events"], [65, 12, 1, "", "get_kv_cache_events_async"], [65, 12, 1, "", "get_stats"], [65, 12, 1, "", "get_stats_async"], [65, 12, 1, "", "save"], [65, 12, 1, "", "shutdown"], [65, 13, 1, "id0", "tokenizer"], [65, 13, 1, "id1", "workspace"]], "tensorrt_llm.llmapi.LookaheadDecodingConfig": [[65, 12, 1, "", "__init__"], [65, 12, 1, "", "calculate_speculative_resource"], [65, 11, 1, "", "decoding_type"], [65, 12, 1, "", "from_dict"], [65, 15, 1, "", "max_ngram_size"], [65, 15, 1, "", "max_verification_set_size"], [65, 15, 1, "", "max_window_size"], [65, 11, 1, "", "model_config"], [65, 12, 1, "", "validate_positive_values"]], "tensorrt_llm.llmapi.MTPDecodingConfig": [[65, 11, 1, "", "decoding_type"], [65, 12, 1, "", "from_dict"], [65, 11, 1, "", "model_config"], [65, 15, 1, "", "num_nextn_predict_layers"], [65, 15, 1, "", "relaxed_delta"], [65, 15, 1, "", "relaxed_topk"], [65, 15, 1, "", "use_relaxed_acceptance_for_thinking"]], "tensorrt_llm.llmapi.MedusaDecodingConfig": [[65, 11, 1, "", "decoding_type"], [65, 12, 1, "", "from_dict"], [65, 15, 1, "", "medusa_choices"], [65, 11, 1, "", "model_config"], [65, 15, 1, "", "num_medusa_heads"]], "tensorrt_llm.llmapi.MpiCommSession": [[65, 12, 1, "", "__init__"], [65, 12, 1, "", "abort"], [65, 12, 1, "", "get_comm"], [65, 12, 1, "", "shutdown"], [65, 12, 1, "", "submit"], [65, 12, 1, "", "submit_sync"]], "tensorrt_llm.llmapi.QuantAlgo": [[65, 11, 1, "", "FP8"], [65, 11, 1, "", "FP8_BLOCK_SCALES"], [65, 11, 1, "", "FP8_PER_CHANNEL_PER_TOKEN"], [65, 11, 1, "", "INT8"], [65, 11, 1, "", "MIXED_PRECISION"], [65, 11, 1, "", "NO_QUANT"], [65, 11, 1, "", "NVFP4"], [65, 11, 1, "", "W4A16"], [65, 11, 1, "", "W4A16_AWQ"], [65, 11, 1, "", "W4A16_GPTQ"], [65, 11, 1, "", "W4A8_AWQ"], [65, 11, 1, "", 
"W4A8_QSERVE_PER_CHANNEL"], [65, 11, 1, "", "W4A8_QSERVE_PER_GROUP"], [65, 11, 1, "", "W8A16"], [65, 11, 1, "", "W8A16_GPTQ"], [65, 11, 1, "", "W8A8_SQ_PER_CHANNEL"], [65, 11, 1, "", "W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN"], [65, 11, 1, "", "W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN"], [65, 11, 1, "", "W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN"], [65, 11, 1, "", "W8A8_SQ_PER_TENSOR_PLUGIN"]], "tensorrt_llm.llmapi.QuantConfig": [[65, 12, 1, "", "__init__"], [65, 11, 1, "", "clamp_val"], [65, 11, 1, "", "exclude_modules"], [65, 12, 1, "", "from_dict"], [65, 11, 1, "", "group_size"], [65, 11, 1, "", "has_zero_point"], [65, 12, 1, "", "is_module_excluded_from_quantization"], [65, 11, 1, "", "kv_cache_quant_algo"], [65, 13, 1, "", "layer_quant_mode"], [65, 11, 1, "", "pre_quant_scale"], [65, 11, 1, "", "quant_algo"], [65, 13, 1, "", "quant_mode"], [65, 11, 1, "", "smoothquant_val"], [65, 12, 1, "", "to_dict"], [65, 11, 1, "", "use_meta_recipe"]], "tensorrt_llm.llmapi.RequestOutput": [[65, 12, 1, "", "__init__"], [65, 11, 1, "", "context_logits"], [65, 11, 1, "", "finished"], [65, 11, 1, "", "outputs"], [65, 13, 1, "id6", "prompt"], [65, 11, 1, "", "prompt_token_ids"], [65, 11, 1, "", "request_id"]], "tensorrt_llm.llmapi.SamplingParams": [[65, 12, 1, "", "__init__"], [65, 11, 1, "", "add_special_tokens"], [65, 11, 1, "", "additional_model_outputs"], [65, 11, 1, "", "apply_batched_logits_processor"], [65, 11, 1, "", "bad"], [65, 11, 1, "", "bad_token_ids"], [65, 11, 1, "", "beam_search_diversity_rate"], [65, 11, 1, "", "beam_width_array"], [65, 11, 1, "", "best_of"], [65, 11, 1, "", "detokenize"], [65, 11, 1, "", "early_stopping"], [65, 11, 1, "", "embedding_bias"], [65, 11, 1, "", "end_id"], [65, 11, 1, "", "exclude_input_from_output"], [65, 11, 1, "", "frequency_penalty"], [65, 11, 1, "", "guided_decoding"], [65, 11, 1, "", "ignore_eos"], [65, 11, 1, "", "include_stop_str_in_output"], [65, 11, 1, "", "length_penalty"], [65, 11, 1, "", "logits_processor"], [65, 11, 1, "", 
"logprobs"], [65, 11, 1, "", "lookahead_config"], [65, 11, 1, "", "max_tokens"], [65, 11, 1, "", "min_p"], [65, 11, 1, "", "min_tokens"], [65, 11, 1, "", "n"], [65, 11, 1, "", "no_repeat_ngram_size"], [65, 11, 1, "", "pad_id"], [65, 11, 1, "", "presence_penalty"], [65, 11, 1, "", "prompt_logprobs"], [65, 11, 1, "", "repetition_penalty"], [65, 11, 1, "", "return_context_logits"], [65, 11, 1, "", "return_encoder_output"], [65, 11, 1, "", "return_generation_logits"], [65, 11, 1, "", "return_perf_metrics"], [65, 11, 1, "", "seed"], [65, 11, 1, "", "skip_special_tokens"], [65, 11, 1, "", "spaces_between_special_tokens"], [65, 11, 1, "", "stop"], [65, 11, 1, "", "stop_token_ids"], [65, 11, 1, "", "temperature"], [65, 11, 1, "", "top_k"], [65, 11, 1, "", "top_p"], [65, 11, 1, "", "top_p_decay"], [65, 11, 1, "", "top_p_min"], [65, 11, 1, "", "top_p_reset_ids"], [65, 11, 1, "", "truncate_prompt_tokens"], [65, 11, 1, "", "use_beam_search"]], "tensorrt_llm.llmapi.SchedulerConfig": [[65, 15, 1, "", "capacity_scheduler_policy"], [65, 15, 1, "", "context_chunking_policy"], [65, 15, 1, "", "dynamic_batch_config"], [65, 11, 1, "", "model_config"]], "tensorrt_llm.models": [[79, 10, 1, "", "BaichuanForCausalLM"], [79, 10, 1, "", "BertForQuestionAnswering"], [79, 10, 1, "", "BertForSequenceClassification"], [79, 10, 1, "", "BertModel"], [79, 10, 1, "", "BloomForCausalLM"], [79, 10, 1, "", "BloomModel"], [79, 10, 1, "", "CLIPVisionTransformer"], [79, 10, 1, "", "ChatGLMConfig"], [79, 10, 1, "", "ChatGLMForCausalLM"], [79, 10, 1, "", "ChatGLMModel"], [79, 10, 1, "", "CogVLMConfig"], [79, 10, 1, "", "CogVLMForCausalLM"], [79, 10, 1, "", "CohereForCausalLM"], [79, 10, 1, "", "DbrxConfig"], [79, 10, 1, "", "DbrxForCausalLM"], [79, 10, 1, "", "DecoderModel"], [79, 10, 1, "", "DeepseekForCausalLM"], [79, 10, 1, "", "DeepseekV2ForCausalLM"], [79, 10, 1, "", "DiT"], [79, 10, 1, "", "EagleForCausalLM"], [79, 10, 1, "", "EncoderModel"], [79, 10, 1, "", "FalconConfig"], [79, 10, 1, "", 
"FalconForCausalLM"], [79, 10, 1, "", "FalconModel"], [79, 10, 1, "", "GPTConfig"], [79, 10, 1, "", "GPTForCausalLM"], [79, 10, 1, "", "GPTJConfig"], [79, 10, 1, "", "GPTJForCausalLM"], [79, 10, 1, "", "GPTJModel"], [79, 10, 1, "", "GPTModel"], [79, 10, 1, "", "GPTNeoXForCausalLM"], [79, 10, 1, "", "GPTNeoXModel"], [79, 10, 1, "", "GemmaConfig"], [79, 10, 1, "", "GemmaForCausalLM"], [79, 10, 1, "", "LLaMAConfig"], [79, 10, 1, "", "LLaMAForCausalLM"], [79, 10, 1, "", "LLaMAModel"], [79, 10, 1, "", "LlavaNextVisionConfig"], [79, 10, 1, "", "LlavaNextVisionWrapper"], [79, 10, 1, "", "MLLaMAForCausalLM"], [79, 10, 1, "", "MPTForCausalLM"], [79, 10, 1, "", "MPTModel"], [79, 10, 1, "", "MambaForCausalLM"], [79, 10, 1, "", "MedusaConfig"], [79, 10, 1, "", "MedusaForCausalLm"], [79, 10, 1, "", "OPTForCausalLM"], [79, 10, 1, "", "OPTModel"], [79, 10, 1, "", "Phi3ForCausalLM"], [79, 10, 1, "", "Phi3Model"], [79, 10, 1, "", "PhiForCausalLM"], [79, 10, 1, "", "PhiModel"], [79, 10, 1, "", "PretrainedConfig"], [79, 10, 1, "", "PretrainedModel"], [79, 10, 1, "", "ReDrafterForCausalLM"], [79, 10, 1, "", "RecurrentGemmaForCausalLM"], [79, 11, 1, "", "RobertaForQuestionAnswering"], [79, 11, 1, "", "RobertaForSequenceClassification"], [79, 11, 1, "", "RobertaModel"], [79, 10, 1, "", "SD3Transformer2DModel"], [79, 10, 1, "", "SpeculativeDecodingMode"], [79, 10, 1, "", "WhisperEncoder"]], "tensorrt_llm.models.BaichuanForCausalLM": [[79, 11, 1, "", "config_class"], [79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", "quantize"]], "tensorrt_llm.models.BertForQuestionAnswering": [[79, 12, 1, "", "forward"]], "tensorrt_llm.models.BertForSequenceClassification": [[79, 12, 1, "", "forward"]], "tensorrt_llm.models.BertModel": [[79, 12, 1, "", "forward"]], "tensorrt_llm.models.BloomModel": [[79, 12, 1, "", "forward"]], "tensorrt_llm.models.CLIPVisionTransformer": [[79, 12, 1, "", "forward"]], "tensorrt_llm.models.ChatGLMConfig": [[79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", 
"to_dict"]], "tensorrt_llm.models.ChatGLMForCausalLM": [[79, 11, 1, "", "config_class"], [79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", "prepare_inputs"], [79, 12, 1, "", "quantize"]], "tensorrt_llm.models.ChatGLMModel": [[79, 12, 1, "", "forward"]], "tensorrt_llm.models.CogVLMConfig": [[79, 12, 1, "", "to_dict"]], "tensorrt_llm.models.CogVLMForCausalLM": [[79, 11, 1, "", "config_class"], [79, 12, 1, "", "default_plugin_config"], [79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", "quantize"]], "tensorrt_llm.models.CohereForCausalLM": [[79, 11, 1, "", "config_class"], [79, 12, 1, "", "from_hugging_face"]], "tensorrt_llm.models.DbrxConfig": [[79, 12, 1, "", "to_dict"]], "tensorrt_llm.models.DbrxForCausalLM": [[79, 11, 1, "", "config_class"]], "tensorrt_llm.models.DecoderModel": [[79, 12, 1, "", "check_config"], [79, 12, 1, "", "forward"], [79, 12, 1, "", "precompute_relative_attention_bias"], [79, 12, 1, "", "prepare_inputs"], [79, 12, 1, "", "use_lora"]], "tensorrt_llm.models.DeepseekForCausalLM": [[79, 11, 1, "", "config_class"], [79, 12, 1, "", "from_hugging_face"]], "tensorrt_llm.models.DeepseekV2ForCausalLM": [[79, 11, 1, "", "config_class"], [79, 12, 1, "", "from_hugging_face"]], "tensorrt_llm.models.DiT": [[79, 12, 1, "", "check_config"], [79, 12, 1, "", "forward"], [79, 12, 1, "", "forward_with_cfg"], [79, 12, 1, "", "forward_without_cfg"], [79, 12, 1, "", "prepare_inputs"], [79, 12, 1, "", "unpatchify"]], "tensorrt_llm.models.EagleForCausalLM": [[79, 11, 1, "", "config_class"], [79, 12, 1, "", "forward"], [79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", "prepare_inputs"]], "tensorrt_llm.models.EncoderModel": [[79, 12, 1, "", "check_config"], [79, 12, 1, "", "forward"], [79, 12, 1, "", "precompute_relative_attention_bias"], [79, 12, 1, "", "prepare_inputs"], [79, 12, 1, "", "use_lora"], [79, 12, 1, "", "use_prompt_tuning"]], "tensorrt_llm.models.FalconConfig": [[79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", "to_dict"]], 
"tensorrt_llm.models.FalconForCausalLM": [[79, 12, 1, "", "check_config"], [79, 11, 1, "", "config_class"], [79, 12, 1, "", "from_hugging_face"]], "tensorrt_llm.models.FalconModel": [[79, 12, 1, "", "forward"]], "tensorrt_llm.models.GPTConfig": [[79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", "from_nemo"], [79, 12, 1, "", "to_dict"]], "tensorrt_llm.models.GPTForCausalLM": [[79, 11, 1, "", "config_class"], [79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", "from_nemo"], [79, 12, 1, "", "quantize"], [79, 12, 1, "", "use_lora"]], "tensorrt_llm.models.GPTJConfig": [[79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", "to_dict"]], "tensorrt_llm.models.GPTJForCausalLM": [[79, 11, 1, "", "config_class"], [79, 12, 1, "", "from_hugging_face"]], "tensorrt_llm.models.GPTJModel": [[79, 12, 1, "", "forward"]], "tensorrt_llm.models.GPTModel": [[79, 12, 1, "", "forward"]], "tensorrt_llm.models.GPTNeoXModel": [[79, 12, 1, "", "forward"]], "tensorrt_llm.models.GemmaConfig": [[79, 11, 1, "", "GEMMA2_ADDED_FIELDS"], [79, 11, 1, "", "GEMMA3_ADDED_FIELDS"], [79, 11, 1, "", "GEMMA_ADDED_FIELDS"], [79, 11, 1, "", "VERBATIM"], [79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", "gemma2_config"], [79, 12, 1, "", "gemma3_config"], [79, 12, 1, "", "get_hf_config"], [79, 13, 1, "", "is_gemma_2"], [79, 13, 1, "", "is_gemma_3"], [79, 12, 1, "", "to_dict"]], "tensorrt_llm.models.GemmaForCausalLM": [[79, 11, 1, "", "NATIVE_QUANT_FLOW"], [79, 12, 1, "", "assert_valid_quant_algo"], [79, 11, 1, "", "config_class"], [79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", "quantize"], [79, 12, 1, "", "use_lora"]], "tensorrt_llm.models.LLaMAConfig": [[79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", "from_meta_ckpt"], [79, 12, 1, "", "to_dict"]], "tensorrt_llm.models.LLaMAForCausalLM": [[79, 11, 1, "", "config_class"], [79, 12, 1, "", "default_plugin_config"], [79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", "from_meta_ckpt"], [79, 12, 1, "", "quantize"], [79, 12, 1, "", 
"use_lora"]], "tensorrt_llm.models.LLaMAModel": [[79, 12, 1, "", "forward"]], "tensorrt_llm.models.LlavaNextVisionConfig": [[79, 12, 1, "", "from_hugging_face"]], "tensorrt_llm.models.LlavaNextVisionWrapper": [[79, 12, 1, "", "forward"], [79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", "prepare_inputs"], [79, 12, 1, "", "save_checkpoint"]], "tensorrt_llm.models.MLLaMAForCausalLM": [[79, 11, 1, "", "config_class"], [79, 12, 1, "", "forward"], [79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", "prepare_inputs"], [79, 12, 1, "", "use_lora"]], "tensorrt_llm.models.MPTForCausalLM": [[79, 12, 1, "", "check_config"]], "tensorrt_llm.models.MPTModel": [[79, 12, 1, "", "forward"]], "tensorrt_llm.models.MambaForCausalLM": [[79, 11, 1, "", "config_class"], [79, 12, 1, "", "forward"], [79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", "prepare_inputs"]], "tensorrt_llm.models.MedusaConfig": [[79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", "to_dict"]], "tensorrt_llm.models.MedusaForCausalLm": [[79, 11, 1, "", "config_class"], [79, 12, 1, "", "from_hugging_face"]], "tensorrt_llm.models.OPTForCausalLM": [[79, 12, 1, "", "check_config"]], "tensorrt_llm.models.OPTModel": [[79, 12, 1, "", "forward"]], "tensorrt_llm.models.Phi3ForCausalLM": [[79, 11, 1, "", "config_class"], [79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", "use_lora"]], "tensorrt_llm.models.Phi3Model": [[79, 12, 1, "", "forward"]], "tensorrt_llm.models.PhiForCausalLM": [[79, 12, 1, "", "check_config"], [79, 11, 1, "", "config_class"], [79, 12, 1, "", "from_hugging_face"], [79, 12, 1, "", "use_lora"]], "tensorrt_llm.models.PhiModel": [[79, 12, 1, "", "forward"]], "tensorrt_llm.models.PretrainedConfig": [[79, 12, 1, "", "create_runtime_defaults"], [79, 12, 1, "", "for_each_rank"], [79, 12, 1, "", "from_checkpoint"], [79, 12, 1, "", "from_dict"], [79, 12, 1, "", "from_json_file"], [79, 12, 1, "", "get_config_group"], [79, 12, 1, "", "has_config_group"], [79, 13, 1, "", "kv_dtype"], [79, 13, 1, "", 
"quant_algo"], [79, 13, 1, "", "quant_mode"], [79, 12, 1, "", "set_if_not_exist"], [79, 12, 1, "", "set_rank"], [79, 12, 1, "", "to_dict"], [79, 12, 1, "", "to_json_file"], [79, 12, 1, "", "to_layer_quant_config"]], "tensorrt_llm.models.PretrainedModel": [[79, 12, 1, "", "check_config"], [79, 12, 1, "", "from_checkpoint"], [79, 12, 1, "", "from_config"], [79, 12, 1, "", "load"], [79, 12, 1, "", "prepare_inputs"], [79, 12, 1, "", "quantize"], [79, 12, 1, "", "release"], [79, 12, 1, "", "save_checkpoint"]], "tensorrt_llm.models.ReDrafterForCausalLM": [[79, 12, 1, "", "forward"], [79, 12, 1, "", "prepare_inputs"]], "tensorrt_llm.models.RecurrentGemmaForCausalLM": [[79, 12, 1, "", "forward"], [79, 12, 1, "", "prepare_inputs"], [79, 12, 1, "", "prepare_recurrent_inputs"]], "tensorrt_llm.models.SD3Transformer2DModel": [[79, 13, 1, "", "attn_processors"], [79, 11, 1, "", "config_class"], [79, 12, 1, "", "disable_forward_chunking"], [79, 12, 1, "", "enable_forward_chunking"], [79, 12, 1, "", "forward"], [79, 12, 1, "", "from_pretrained"], [79, 12, 1, "", "fuse_qkv_projections"], [79, 12, 1, "", "load"], [79, 12, 1, "", "prepare_inputs"], [79, 12, 1, "", "set_attn_processor"], [79, 12, 1, "", "unfuse_qkv_projections"]], "tensorrt_llm.models.SpeculativeDecodingMode": [[79, 11, 1, "", "DRAFT_TOKENS_EXTERNAL"], [79, 11, 1, "", "EAGLE"], [79, 11, 1, "", "EXPLICIT_DRAFT_TOKENS"], [79, 11, 1, "", "LOOKAHEAD_DECODING"], [79, 11, 1, "", "MEDUSA"], [79, 11, 1, "", "NONE"], [79, 12, 1, "", "from_arguments"]], "tensorrt_llm.models.WhisperEncoder": [[79, 12, 1, "", "forward"], [79, 12, 1, "", "precompute_relative_attention_bias"], [79, 12, 1, "", "prepare_inputs"]], "tensorrt_llm.plugin": [[80, 10, 1, "", "PluginConfig"]], "tensorrt_llm.plugin.PluginConfig": [[80, 12, 1, "", "to_legacy_setting"]], "tensorrt_llm.quantization": [[81, 10, 1, "", "QuantAlgo"], [81, 10, 1, "", "QuantMode"], [81, 14, 1, "", "quantize_and_export"]], "tensorrt_llm.runtime": [[82, 10, 1, "", 
"ChatGLMGenerationSession"], [82, 10, 1, "", "EncDecModelRunner"], [82, 10, 1, "", "GenerationSequence"], [82, 10, 1, "", "GenerationSession"], [82, 10, 1, "", "KVCacheManager"], [82, 10, 1, "", "LogitsProcessor"], [82, 10, 1, "", "LogitsProcessorList"], [82, 10, 1, "", "ModelConfig"], [82, 10, 1, "", "ModelRunner"], [82, 10, 1, "", "ModelRunnerCpp"], [82, 10, 1, "", "MultimodalModelRunner"], [82, 10, 1, "", "QWenForCausalLMGenerationSession"], [82, 10, 1, "", "SamplingConfig"], [82, 10, 1, "", "Session"], [82, 10, 1, "", "StoppingCriteria"], [82, 10, 1, "", "StoppingCriteriaList"], [82, 10, 1, "", "TensorInfo"], [82, 14, 1, "", "decode_words_list"]], "tensorrt_llm.runtime.EncDecModelRunner": [[82, 12, 1, "", "encoder_run"], [82, 12, 1, "", "from_engine"], [82, 12, 1, "", "generate"], [82, 12, 1, "", "process_input"]], "tensorrt_llm.runtime.GenerationSequence": [[82, 12, 1, "", "get_batch_idx"], [82, 12, 1, "", "get_seq_idx"]], "tensorrt_llm.runtime.GenerationSession": [[82, 11, 1, "", "batch_size"], [82, 11, 1, "", "buffer_allocated"], [82, 13, 1, "", "context_mem_size"], [82, 13, 1, "", "conv_kernel"], [82, 13, 1, "", "cross_attention"], [82, 11, 1, "", "cuda_graph_mode"], [82, 12, 1, "", "cuda_stream_guard"], [82, 11, 1, "", "debug_mode"], [82, 11, 1, "", "debug_tensors_to_save"], [82, 12, 1, "", "decode"], [82, 12, 1, "", "decode_batch"], [82, 12, 1, "", "decode_regular"], [82, 12, 1, "", "decode_stream"], [82, 11, 1, "", "device"], [82, 13, 1, "", "dtype"], [82, 12, 1, "", "dump_debug_buffers"], [82, 12, 1, "", "early_stop_criteria"], [82, 13, 1, "", "engine_inspector"], [82, 12, 1, "", "filter_medusa_logits"], [82, 12, 1, "", "finalize_decoder"], [82, 12, 1, "", "find_best_medusa_path"], [82, 13, 1, "", "first_layer"], [82, 13, 1, "", "gather_context_logits"], [82, 13, 1, "", "gather_generation_logits"], [82, 13, 1, "", "gemm_allreduce_plugin"], [82, 12, 1, "", "get_next_medusa_tokens"], [82, 12, 1, "", "get_num_heads_kv"], [82, 12, 1, "", "handle_per_step"], 
[82, 13, 1, "", "has_position_embedding"], [82, 13, 1, "", "has_token_type_embedding"], [82, 13, 1, "", "head_size"], [82, 13, 1, "", "hidden_size"], [82, 13, 1, "", "is_medusa_mode"], [82, 13, 1, "", "is_redrafter_mode"], [82, 13, 1, "", "kv_cache_type"], [82, 13, 1, "", "last_layer"], [82, 12, 1, "", "locate_accepted_draft_tokens"], [82, 11, 1, "", "mapping"], [82, 13, 1, "", "max_draft_tokens"], [82, 13, 1, "", "max_prompt_embedding_table_size"], [82, 12, 1, "", "medusa_decode_and_verify"], [82, 11, 1, "", "medusa_paths"], [82, 11, 1, "", "medusa_position_offsets"], [82, 11, 1, "", "medusa_temperature"], [82, 11, 1, "", "medusa_topks"], [82, 11, 1, "", "medusa_tree_ids"], [82, 12, 1, "", "next_medusa_input_ids"], [82, 11, 1, "", "num_draft_tokens"], [82, 13, 1, "", "num_heads"], [82, 13, 1, "", "num_layers"], [82, 13, 1, "", "num_medusa_heads"], [82, 13, 1, "", "paged_kv_cache"], [82, 13, 1, "", "paged_state"], [82, 12, 1, "", "pp_communicate_final_output_ids"], [82, 12, 1, "", "pp_communicate_new_tokens"], [82, 12, 1, "", "process_logits_including_draft"], [82, 13, 1, "", "profiler"], [82, 13, 1, "", "quant_mode"], [82, 13, 1, "", "remove_input_padding"], [82, 12, 1, "", "reorder_kv_cache_for_beam_search"], [82, 13, 1, "", "rnn_conv_dim_size"], [82, 13, 1, "", "rnn_head_size"], [82, 13, 1, "", "rnn_hidden_size"], [82, 11, 1, "", "runtime"], [82, 12, 1, "", "setup"], [82, 13, 1, "", "state_dtype"], [82, 13, 1, "", "state_size"], [82, 13, 1, "", "tokens_per_block"], [82, 12, 1, "", "update_output_ids_by_offset"], [82, 13, 1, "", "use_gemm_allreduce_plugin"], [82, 13, 1, "", "use_gpt_attention_plugin"], [82, 13, 1, "", "use_kv_cache"], [82, 13, 1, "", "use_lora_plugin"], [82, 13, 1, "", "use_mamba_conv1d_plugin"], [82, 13, 1, "", "vocab_size"]], "tensorrt_llm.runtime.KVCacheManager": [[82, 12, 1, "", "add_sequence"], [82, 12, 1, "", "get_block_offsets"], [82, 12, 1, "", "step"]], "tensorrt_llm.runtime.ModelConfig": [[82, 11, 1, "", "conv_kernel"], [82, 11, 1, "", 
"cross_attention"], [82, 11, 1, "", "dtype"], [82, 11, 1, "", "gather_context_logits"], [82, 11, 1, "", "gather_generation_logits"], [82, 11, 1, "", "gemm_allreduce_plugin"], [82, 11, 1, "", "gpt_attention_plugin"], [82, 11, 1, "", "gpu_weights_percent"], [82, 11, 1, "", "has_position_embedding"], [82, 11, 1, "", "has_token_type_embedding"], [82, 11, 1, "", "head_size"], [82, 11, 1, "", "hidden_size"], [82, 11, 1, "", "kv_cache_type"], [82, 11, 1, "", "language_adapter_config"], [82, 11, 1, "", "layer_types"], [82, 11, 1, "", "lora_plugin"], [82, 11, 1, "", "lora_target_modules"], [82, 11, 1, "", "mamba_conv1d_plugin"], [82, 11, 1, "", "max_batch_size"], [82, 11, 1, "", "max_beam_width"], [82, 11, 1, "", "max_medusa_tokens"], [82, 11, 1, "", "max_prompt_embedding_table_size"], [82, 11, 1, "", "model_name"], [82, 11, 1, "", "num_heads"], [82, 11, 1, "", "num_kv_heads"], [82, 11, 1, "", "num_kv_heads_per_cross_attn_layer"], [82, 11, 1, "", "num_kv_heads_per_layer"], [82, 11, 1, "", "num_layers"], [82, 11, 1, "", "num_medusa_heads"], [82, 11, 1, "", "paged_state"], [82, 11, 1, "", "quant_mode"], [82, 11, 1, "", "redrafter_draft_len_per_beam"], [82, 11, 1, "", "redrafter_num_beams"], [82, 11, 1, "", "remove_input_padding"], [82, 11, 1, "", "rnn_conv_dim_size"], [82, 11, 1, "", "rnn_head_size"], [82, 11, 1, "", "rnn_hidden_size"], [82, 11, 1, "", "skip_cross_attn_blocks"], [82, 11, 1, "", "skip_cross_kv"], [82, 11, 1, "", "state_dtype"], [82, 11, 1, "", "state_size"], [82, 11, 1, "", "tokens_per_block"], [82, 11, 1, "", "trtllm_modules_to_hf_modules"], [82, 11, 1, "", "vocab_size"]], "tensorrt_llm.runtime.ModelRunner": [[82, 13, 1, "", "dtype"], [82, 12, 1, "", "from_dir"], [82, 12, 1, "", "from_engine"], [82, 13, 1, "", "gather_context_logits"], [82, 13, 1, "", "gather_generation_logits"], [82, 12, 1, "", "generate"], [82, 13, 1, "", "hidden_size"], [82, 13, 1, "", "mapping"], [82, 13, 1, "", "max_prompt_embedding_table_size"], [82, 13, 1, "", "max_sequence_length"], 
[82, 13, 1, "", "num_heads"], [82, 13, 1, "", "num_layers"], [82, 13, 1, "", "remove_input_padding"], [82, 12, 1, "", "serialize_engine"], [82, 13, 1, "", "use_lora_plugin"], [82, 13, 1, "", "vocab_size"], [82, 13, 1, "", "vocab_size_padded"]], "tensorrt_llm.runtime.ModelRunnerCpp": [[82, 13, 1, "", "dtype"], [82, 12, 1, "", "from_dir"], [82, 13, 1, "", "gather_context_logits"], [82, 13, 1, "", "gather_generation_logits"], [82, 12, 1, "", "generate"], [82, 13, 1, "", "hidden_size"], [82, 13, 1, "", "max_prompt_embedding_table_size"], [82, 13, 1, "", "max_sequence_length"], [82, 13, 1, "", "num_heads"], [82, 13, 1, "", "num_layers"], [82, 13, 1, "", "remove_input_padding"], [82, 13, 1, "", "vocab_size"], [82, 13, 1, "", "vocab_size_padded"]], "tensorrt_llm.runtime.MultimodalModelRunner": [[82, 13, 1, "", "audio_engine_dir"], [82, 13, 1, "", "cpp_e2e"], [82, 13, 1, "", "cpp_llm_only"], [82, 12, 1, "", "generate"], [82, 12, 1, "", "get_audio_features"], [82, 12, 1, "", "get_rope_index"], [82, 12, 1, "", "get_visual_features"], [82, 12, 1, "", "init_audio_encoder"], [82, 12, 1, "", "init_image_encoder"], [82, 12, 1, "", "init_llm"], [82, 12, 1, "", "init_processor"], [82, 12, 1, "", "init_tokenizer"], [82, 13, 1, "", "llm_engine_dir"], [82, 12, 1, "", "load_test_audio"], [82, 12, 1, "", "load_test_data"], [82, 12, 1, "", "prepare_position_ids_for_cogvlm"], [82, 12, 1, "", "preprocess"], [82, 12, 1, "", "ptuning_setup"], [82, 12, 1, "", "ptuning_setup_fuyu"], [82, 12, 1, "", "ptuning_setup_llava_next"], [82, 12, 1, "", "ptuning_setup_phi3"], [82, 12, 1, "", "ptuning_setup_pixtral"], [82, 13, 1, "", "python_e2e"], [82, 12, 1, "", "run"], [82, 12, 1, "", "setup_fake_prompts"], [82, 12, 1, "", "setup_fake_prompts_qwen2vl"], [82, 12, 1, "", "setup_fake_prompts_vila"], [82, 12, 1, "", "setup_inputs"], [82, 12, 1, "", "split_prompt_by_images"], [82, 12, 1, "", "tokenizer_image_token"], [82, 12, 1, "", "video_preprocess"], [82, 13, 1, "", "visual_engine_dir"]], 
"tensorrt_llm.runtime.QWenForCausalLMGenerationSession": [[82, 12, 1, "", "generate"]], "tensorrt_llm.runtime.SamplingConfig": [[82, 11, 1, "", "bad_words_list"], [82, 11, 1, "", "beam_search_diversity_rate"], [82, 11, 1, "", "early_stopping"], [82, 11, 1, "", "end_id"], [82, 11, 1, "", "frequency_penalty"], [82, 11, 1, "", "length_penalty"], [82, 11, 1, "", "max_attention_window_size"], [82, 11, 1, "", "max_new_tokens"], [82, 11, 1, "", "min_length"], [82, 11, 1, "", "min_p"], [82, 11, 1, "", "no_repeat_ngram_size"], [82, 11, 1, "", "num_beams"], [82, 11, 1, "", "num_return_sequences"], [82, 11, 1, "", "output_cum_log_probs"], [82, 11, 1, "", "output_log_probs"], [82, 11, 1, "", "output_sequence_lengths"], [82, 11, 1, "", "pad_id"], [82, 11, 1, "", "presence_penalty"], [82, 11, 1, "", "random_seed"], [82, 11, 1, "", "repetition_penalty"], [82, 11, 1, "", "return_dict"], [82, 11, 1, "", "sink_token_length"], [82, 11, 1, "", "stop_words_list"], [82, 11, 1, "", "temperature"], [82, 11, 1, "", "top_k"], [82, 11, 1, "", "top_p"], [82, 11, 1, "", "top_p_decay"], [82, 11, 1, "", "top_p_min"], [82, 11, 1, "", "top_p_reset_ids"], [82, 12, 1, "", "update"], [82, 11, 1, "", "use_beam_hyps"]], "tensorrt_llm.runtime.Session": [[82, 13, 1, "", "context"], [82, 13, 1, "", "context_mem_size"], [82, 13, 1, "", "engine"], [82, 12, 1, "", "from_engine"], [82, 12, 1, "", "from_serialized_engine"], [82, 12, 1, "", "infer_shapes"], [82, 12, 1, "", "run"], [82, 13, 1, "", "runtime"], [82, 12, 1, "", "set_shapes"]], "tensorrt_llm.runtime.TensorInfo": [[82, 11, 1, "", "dtype"], [82, 11, 1, "", "name"], [82, 12, 1, "", "numel"], [82, 11, 1, "", "shape"], [82, 12, 1, "", "squeeze"], [82, 12, 1, "", "view"]], "trtllm-serve-disaggregated": [[26, 16, 1, "cmdoption-trtllm-serve-disaggregated-c", "--config_file"], [26, 16, 1, "cmdoption-trtllm-serve-disaggregated-r", "--request_timeout"], [26, 16, 1, "cmdoption-trtllm-serve-disaggregated-t", "--server_start_timeout"], [26, 16, 1, 
"cmdoption-trtllm-serve-disaggregated-c", "-c"], [26, 16, 1, "cmdoption-trtllm-serve-disaggregated-r", "-r"], [26, 16, 1, "cmdoption-trtllm-serve-disaggregated-t", "-t"]], "trtllm-serve-disaggregated_mpi_worker": [[26, 16, 1, "cmdoption-trtllm-serve-disaggregated_mpi_worker-c", "--config_file"], [26, 16, 1, "cmdoption-trtllm-serve-disaggregated_mpi_worker-log_level", "--log_level"], [26, 16, 1, "cmdoption-trtllm-serve-disaggregated_mpi_worker-c", "-c"]], "trtllm-serve-serve": [[26, 16, 1, "cmdoption-trtllm-serve-serve-backend", "--backend"], [26, 16, 1, "cmdoption-trtllm-serve-serve-cluster_size", "--cluster_size"], [26, 16, 1, "cmdoption-trtllm-serve-serve-ep_size", "--ep_size"], [26, 16, 1, "cmdoption-trtllm-serve-serve-extra_llm_api_options", "--extra_llm_api_options"], [26, 16, 1, "cmdoption-trtllm-serve-serve-gpus_per_node", "--gpus_per_node"], [26, 16, 1, "cmdoption-trtllm-serve-serve-host", "--host"], [26, 16, 1, "cmdoption-trtllm-serve-serve-kv_cache_free_gpu_memory_fraction", "--kv_cache_free_gpu_memory_fraction"], [26, 16, 1, "cmdoption-trtllm-serve-serve-log_level", "--log_level"], [26, 16, 1, "cmdoption-trtllm-serve-serve-max_batch_size", "--max_batch_size"], [26, 16, 1, "cmdoption-trtllm-serve-serve-max_beam_width", "--max_beam_width"], [26, 16, 1, "cmdoption-trtllm-serve-serve-max_num_tokens", "--max_num_tokens"], [26, 16, 1, "cmdoption-trtllm-serve-serve-max_seq_len", "--max_seq_len"], [26, 16, 1, "cmdoption-trtllm-serve-serve-num_postprocess_workers", "--num_postprocess_workers"], [26, 16, 1, "cmdoption-trtllm-serve-serve-port", "--port"], [26, 16, 1, "cmdoption-trtllm-serve-serve-pp_size", "--pp_size"], [26, 16, 1, "cmdoption-trtllm-serve-serve-reasoning_parser", "--reasoning_parser"], [26, 16, 1, "cmdoption-trtllm-serve-serve-tokenizer", "--tokenizer"], [26, 16, 1, "cmdoption-trtllm-serve-serve-tp_size", "--tp_size"], [26, 16, 1, "cmdoption-trtllm-serve-serve-trust_remote_code", "--trust_remote_code"], [26, 16, 1, 
"cmdoption-trtllm-serve-serve-arg-MODEL", "MODEL"]]}, "objnames": {"0": ["c", "macro", "C macro"], "1": ["cpp", "type", "C++ type"], "2": ["cpp", "class", "C++ class"], "3": ["cpp", "function", "C++ function"], "4": ["cpp", "functionParam", "C++ function parameter"], "5": ["cpp", "member", "C++ member"], "6": ["cpp", "enum", "C++ enum"], "7": ["cpp", "enumerator", "C++ enumerator"], "8": ["cpp", "templateParam", "C++ template parameter"], "9": ["py", "module", "Python module"], "10": ["py", "class", "Python class"], "11": ["py", "attribute", "Python attribute"], "12": ["py", "method", "Python method"], "13": ["py", "property", "Python property"], "14": ["py", "function", "Python function"], "15": ["py", "pydantic_field", "Python field"], "16": ["std", "cmdoption", "program option"]}, "objtypes": {"0": "c:macro", "1": "cpp:type", "2": "cpp:class", "3": "cpp:function", "4": "cpp:functionParam", "5": "cpp:member", "6": "cpp:enum", "7": "cpp:enumerator", "8": "cpp:templateParam", "9": "py:module", "10": "py:class", "11": "py:attribute", "12": "py:method", "13": "py:property", "14": "py:function", "15": "py:pydantic_field", "16": "std:cmdoption"}, "terms": {"": [0, 1, 2, 3, 4, 6, 7, 10, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 24, 25, 27, 40, 41, 45, 46, 47, 53, 60, 64, 65, 66, 68, 70, 72, 73, 74, 75, 77, 78, 79, 82, 83, 84, 85, 87, 88, 90, 91, 92, 93], "0": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 20, 21, 23, 24, 25, 26, 29, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, 52, 53, 54, 55, 57, 59, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 74, 75, 76, 77, 78, 79, 82, 83, 84, 86, 87, 89, 90, 94], "00": [14, 24, 50, 51, 52, 68, 69, 70, 87], "000": [18, 68], "0000": [68, 70], "0007503032684326172": 26, "001": 46, "0012": 68, "0017": 69, "003": 69, "0047": 87, "005": 69, "0070": 87, "0071": 87, "0096": 87, "00978": 85, "01": [23, 24, 50, 51, 52, 68, 69, 84, 88], "014": 21, "0158": 70, "016": 69, "0162": 72, "0165": 74, "017": 69, 
"02": [69, 88], "021": 69, "022": 69, "0235": 87, "0260": 87, "0273": 87, "028": 69, "0294": 87, "03": [74, 87, 88], "032": 24, "0339": 69, "03762": 77, "03961": 4, "04": [61, 62, 69, 86, 88, 89], "043": 69, "0449": 87, "0461": 18, "0463": 69, "05": [69, 77, 78, 79, 87, 88], "05100": 77, "0523": 87, "055": 69, "0554": 70, "0560": 87, "0563": 69, "06": [24, 68, 69, 77, 78], "0630": 87, "0669": 18, "068": 69, "0682": 87, "0689e": 68, "07": [23, 24, 69, 88], "0704": 70, "0713": 87, "0723": 87, "0732": 87, "0758": 18, "0772": 18, "0776": 87, "08": [24, 69, 74], "0804": 87, "082": 69, "0838": 69, "0881": 75, "089": 69, "09": [24, 87], "0903": 87, "0910": 87, "092": 69, "09353": 9, "0964": 69, "09685": 9, "097": 69, "09f": [0, 1], "0b": 2, "0e": 6, "0f": [0, 6, 65], "0rc1": 68, "0u": 1, "0x": 20, "0x0000000000000000": 88, "1": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 20, 21, 22, 23, 24, 25, 26, 29, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 57, 59, 61, 62, 64, 65, 67, 68, 70, 71, 72, 74, 76, 77, 78, 79, 81, 82, 83, 86, 87, 89, 93], "10": [0, 8, 9, 10, 18, 23, 24, 26, 32, 34, 39, 46, 54, 59, 62, 65, 68, 69, 70, 72, 75, 77, 84, 86, 87], "100": [0, 8, 18, 26, 34, 51, 67, 68, 70, 83], "1000": [0, 67, 68, 69, 70], "10000": [77, 78, 79], "1003": 88, "101": 8, "101230": 46, "101978": 69, "102": [8, 20], "1024": [1, 6, 13, 18, 21, 23, 25, 32, 39, 46, 49, 65, 68, 69, 70, 74, 77, 78, 87], "103": 8, "104": 88, "10438": 85, "1045": 87, "1047": 68, "1050": 87, "1051": 70, "1059": 68, "106563": 69, "1072": 87, "107501": 69, "10764": 48, "10774": 0, "1079": 17, "108": 69, "1082": 87, "10858": 32, "10b": [64, 77, 88], "10m": 20, "11": [0, 9, 10, 18, 21, 23, 59, 68, 69, 72, 77, 87], "11023": 68, "110804": 69, "110b": 88, "111": [20, 24], "111302": 69, "111618": 69, "111668": 69, "1118": 88, "1123": 88, "1134": 84, "1135": 87, "1141": 87, "1148": 88, "11489": 18, "11490": 68, "1151": 18, "115716": 69, "1160": [26, 33], "117": 69, 
"1178": 68, "1181": 88, "1183": 88, "119": 68, "11943": 68, "11947": 32, "1196": 18, "11b": [86, 88], "12": [0, 9, 13, 20, 24, 32, 59, 61, 62, 68, 69, 72, 74, 77, 87], "1207": 48, "1212": 87, "121847": 68, "1219": 18, "122": 68, "1225": 77, "12288": 68, "123": [26, 34, 35], "1234": [65, 79], "1239": 88, "1242": 88, "1248": 88, "125": 68, "1252": [17, 68], "1256": 88, "125m": [10, 13], "126": 68, "1267": 88, "127": 77, "1272": 87, "128": [0, 1, 5, 8, 9, 11, 14, 18, 19, 20, 21, 22, 23, 24, 26, 32, 34, 35, 44, 51, 65, 68, 69, 88], "1284": 88, "1287": 72, "1290": 87, "1291504": 70, "1293": 17, "12945": 18, "129498": 18, "13": [5, 9, 22, 59, 68, 69, 70, 77, 87], "1300": 40, "13044": 48, "131072": [68, 70], "13195": 68, "132": [68, 69], "1323": 88, "1328": 88, "1329": 88, "133": 88, "13368": 68, "1337": 88, "1341": 18, "1343": 88, "1344": 88, "13525": 68, "13598": 68, "1363": 48, "137": 68, "1378": 87, "139": 69, "1392": 88, "13b": 20, "14": [9, 13, 23, 59, 68, 69, 72, 74, 75, 87], "140g": 17, "141": 21, "1418": 68, "141gb": [19, 69], "1424": 88, "1436": [18, 88], "1437": 87, "144": 72, "1446": 88, "1447": 88, "14480": 68, "1449": 88, "145": [74, 75], "1459": 87, "146": [74, 75], "1467": 88, "147": [70, 72, 74, 75], "1480": 88, "1486": 88, "149": [87, 88], "15": [9, 24, 59, 68, 69, 75, 77, 87], "150": 67, "1500": 69, "15043": 32, "1514": 88, "1529": 88, "1534": 88, "1535": 88, "1536": 18, "1537": 88, "1539": 88, "154": 24, "1552": 88, "1556": 87, "15585": 68, "1562": 88, "1564": [70, 74, 75], "158": 18, "1583": 88, "1584": 18, "1585": 70, "15889": 48, "1589": 88, "1590": 88, "1597": 72, "16": [0, 5, 9, 10, 14, 18, 20, 23, 24, 26, 29, 31, 50, 51, 52, 59, 60, 68, 69, 70, 71, 77, 78, 79, 84, 85, 87], "160": 88, "1607": 68, "161": [26, 33, 68], "1625": 72, "1626": 88, "163": 19, "1637": 88, "16384": [72, 74], "164": 24, "1642": 88, "1650": 88, "1660": 88, "1669": 88, "167": [68, 69], "1672": 87, "1674": 88, "1675": 88, "1676": 88, "168": 24, "16e": 86, "16x": 84, "17": [0, 
2, 9, 18, 59, 68, 69, 74, 87, 89], "1706": 77, "1721": 87, "1723": 88, "17233": 18, "173": 24, "1732": 88, "17323": 85, "1738": 88, "174": 69, "1741966075": 83, "1742": 88, "17453": 25, "17453v3": 1, "175": 69, "175b": 21, "176": 68, "176064": 18, "1762": 88, "1799": 88, "17b": 86, "18": [2, 9, 59, 66, 68, 69, 87], "180": [24, 84], "180000000": 0, "180b": [23, 68], "1811": 48, "1815": 88, "181540": 18, "182": 69, "1822": 32, "183": 69, "1834": 88, "184": 69, "185": [20, 68], "1851": 88, "18527": 32, "18533": 48, "18563": 68, "1861": 75, "1866": 75, "1885": 70, "1886": 88, "1889": 48, "1897": 88, "19": [2, 18, 59, 69, 75, 87], "1900": 48, "1909": 88, "191": 69, "192": 19, "1921": 18, "1926": 88, "1937": 88, "1939": 88, "1944": 74, "1953": 88, "1959": 68, "198": 24, "1985": 88, "1987": 88, "1993": 87, "1999": 88, "1_405b": 14, "1_70b": 14, "1b": [26, 29, 31, 34, 36, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 53, 54, 55, 57, 61, 62, 64, 83, 89], "1d": [5, 77, 82], "1e": [13, 77, 78, 79], "1e20f": 1, "1g": 87, "1gb": 2, "1k": [18, 24], "1m": 75, "1st": [20, 77, 84], "1u": [0, 1], "1x": 24, "1xh200": 19, "1ytic": 88, "2": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 13, 14, 17, 19, 20, 21, 23, 24, 26, 38, 39, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 59, 61, 62, 64, 65, 68, 69, 71, 72, 74, 75, 77, 79, 82, 85, 86, 87, 93], "20": [1, 6, 10, 11, 26, 55, 57, 68, 69, 70, 74, 77, 82, 87], "200": [21, 82], "2000": 69, "20000": 69, "2017": 74, "2018": 88, "2023": [19, 87], "2024": 24, "2025": [18, 24, 68], "2028": 88, "203": 69, "2033": 75, "2039": 88, "204": [24, 69], "2040": 88, "2044": [74, 75], "2045": 74, "2048": [13, 18, 19, 21, 22, 25, 44, 65, 68, 69, 70, 72, 73, 74, 75, 79, 82, 87, 88], "2056": 88, "206": 69, "20627": 32, "20685": 68, "2079": 87, "208": 69, "2081": [72, 74, 88], "2087": 88, "2089": 69, "209": 69, "20b": 88, "21": [10, 23, 24, 69, 74, 87, 88], "2101": 4, "2102": 69, "2106": 9, "2107": [48, 87], "210g": 17, "211": 24, "2113": 88, "2135": 88, "21367": 48, "2152": 
88, "2158": 69, "2168": 18, "2169": 88, "21747": 68, "2176": 69, "21764": 68, "2182": 88, "2191": 88, "22": [28, 69, 77, 87], "22000": 69, "22056": 68, "221": 68, "2210": 85, "2211": [77, 85], "2219": 88, "22213": 68, "2225": 87, "2232": 88, "224": 78, "2243": 88, "2263": 88, "227": 22, "2288": 88, "2294": 88, "23": [68, 69, 87, 88], "2305": 87, "2306": 85, "2309": [1, 25], "232": 22, "2337": 48, "2352": 88, "2357": 88, "236": 24, "2366": 88, "2370": 88, "2373": 88, "2379": 88, "2388": 88, "239": 24, "2397": 68, "24": [0, 61, 62, 69, 87, 88, 89], "240": 69, "2401": 0, "2402": 9, "24189": 69, "2419": 88, "242": 69, "2425": 88, "2439": 88, "245": 24, "2458": 88, "2461": 74, "2466": 74, "2473": 88, "2474": [72, 74], "2484": 88, "2485": 88, "2487": 69, "249": 24, "25": [22, 24, 68, 69, 86, 88], "250": [18, 24], "2500": 69, "25032": 68, "253": [24, 69], "2552": 88, "256": [1, 18, 19, 22, 54, 65, 68, 69, 77, 87, 88], "25603": 68, "2573": 88, "2581": [72, 74], "2590780": 68, "259840": 84, "26": [68, 69, 72, 83], "260": 69, "2602": 32, "2628": [74, 75], "263": [19, 32, 48], "2640": 75, "2649": 87, "2671": 18, "2677": 88, "26778": 68, "2679": 72, "2685": 88, "2688": 48, "2691": 88, "27": [69, 88], "270": 69, "2712": 88, "274": [18, 88], "2742": 70, "275": 88, "27556": 48, "276": 69, "278": [32, 48, 69], "2782": 88, "2787": 88, "2796": 88, "28": [24, 68, 69, 87], "2820": 87, "2826700": 18, "28390": 68, "287113": 68, "288": 88, "29": [69, 84], "292": 69, "2939": 87, "294": 69, "297": 32, "29889": 48, "29892": 32, "299": [24, 68], "29962": 32, "2998": 87, "2b": [17, 59, 68], "2d": [10, 77, 78, 85], "2k": [18, 24], "2m": 75, "2nd": 77, "2u": 1, "2x": [20, 21], "3": [0, 1, 3, 5, 7, 8, 9, 15, 19, 20, 21, 23, 24, 39, 41, 43, 47, 49, 53, 54, 59, 61, 62, 64, 65, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 82, 83, 87, 88, 89, 90], "30": [0, 10, 18, 24, 65, 69, 70, 72, 75, 77, 84], "300": [22, 68], "3000": [68, 69], "30000": 69, "30065": 68, "3019": 68, "3021": 18, "3022": 68, "303": 21, 
"3031": 74, "304": [32, 48], "3040": [70, 74, 75], "306": 32, "3072": 18, "30990": 68, "30b": 23, "30x": 23, "31": [69, 70, 74, 75], "311": 69, "3132": 68, "315": [24, 69], "318": 69, "32": [1, 5, 8, 18, 20, 21, 25, 32, 48, 65, 68, 69, 70, 77, 78, 79, 82, 83, 84, 85, 87, 88, 89], "3201": 70, "321": 68, "322": [32, 48], "3276": [70, 74, 75], "32768": 77, "3291": 87, "32b": 88, "32k": 88, "32x": 23, "33": [69, 87], "332": 69, "3328": 87, "3338": 70, "338": [24, 32, 48], "3389": 72, "34": [18, 69], "340": [24, 69], "341": 21, "3442": 87, "3445": 87, "3452": 87, "3476": 18, "349": 21, "34b": 88, "35": [0, 65, 69], "351": 69, "3555": 87, "35611": 18, "357": 69, "36": [24, 69, 71, 72], "3671": 68, "368": 24, "37": 68, "370": 69, "371": 69, "374": 69, "375": 69, "3763": 24, "379": 69, "38": [68, 69], "384": [18, 69], "3863": 69, "387": 69, "387b12598a9e": 68, "3885": 18, "3887": 87, "39": [24, 69], "3914": 69, "3936": 68, "3977": 87, "399": 69, "3_1": 86, "3_3": 86, "3b": [30, 35, 56], "3d": [5, 77, 82], "3rd": 77, "3u": 1, "3x": [23, 24], "4": [0, 1, 2, 7, 8, 9, 10, 14, 17, 21, 23, 24, 26, 32, 39, 44, 48, 49, 50, 51, 52, 59, 65, 68, 69, 70, 72, 73, 74, 75, 76, 77, 79, 82, 83, 84, 85, 86, 87, 88], "40": [6, 69, 72, 77, 88], "403": 88, "405": 48, "405b": [68, 71], "4060": 84, "4066": 32, "408": 69, "4089": 75, "4096": [19, 32, 68, 69, 72, 77, 78, 82], "40b": 23, "40gb": 25, "40x": 23, "41": 69, "41020": 68, "411": 68, "4115": 24, "4117e": 68, "4133": 75, "41375": 68, "414": 18, "41607": 68, "4168": 18, "4192": 87, "42": [47, 68, 69], "4203099703668305365": 46, "4224": 69, "4248": 72, "4265": 68, "427": [48, 68, 69], "4280": 24, "43": [69, 83, 84], "433": 69, "437": 69, "438": 69, "44": [69, 84], "4408": 32, "442": 69, "4439": 68, "4451": 18, "4456": 69, "447": 69, "448": 69, "449": 88, "4493": [18, 74, 75], "4497": 69, "44x": 23, "45": [8, 69, 86, 88], "450": 69, "45000000000": 8, "453": 69, "4566": 69, "459": 69, "46": 23, "462": 69, "463": 69, "4653": 32, "4656": 69, 
"466": 69, "4667": 69, "47": [23, 72], "4701": 68, "471": 69, "472": 32, "475": 69, "477": 69, "478": 88, "47x": 23, "48": [69, 72, 84, 88], "481": [20, 69], "482": 88, "488": 69, "49": [69, 72], "49152": 18, "495": 69, "4963": 68, "49b": 86, "4b": 88, "4bit": 19, "4u": 1, "4x": [19, 20, 21], "5": [0, 1, 8, 9, 10, 11, 13, 19, 20, 21, 23, 24, 30, 35, 39, 40, 46, 49, 56, 64, 65, 68, 69, 74, 77, 79, 82, 86, 87, 88], "50": [0, 23, 40, 65, 68, 69, 88], "500": [24, 69], "5000": 69, "500000": 79, "5001": 48, "5007": 32, "500m": 23, "50272": 13, "505143404006958": 26, "5064": 69, "5073": 87, "51": 69, "512": [1, 9, 11, 21, 22, 65, 68, 69, 72, 74, 79], "5120": 18, "512mb": 2, "514": 69, "518": [32, 69], "51b": [86, 88], "51x": 23, "52269": 69, "524": 69, "525": 69, "526": [48, 69, 88], "52667": 69, "529": 69, "5299": 72, "53": [68, 74, 75], "5305": 72, "531": 69, "54": [23, 69], "540": 68, "543": 69, "544": 69, "5496": 72, "5497": 69, "55": [23, 68, 69], "5500": 69, "5510": 68, "5514": 68, "5530": 69, "554": 69, "557": 69, "559": 69, "56": [23, 69], "560": 19, "562": [9, 11], "56401920000": 26, "565": 69, "567": 69, "568": [68, 69], "57": [68, 69], "571": 69, "572": 69, "5739": 18, "5742": [72, 74], "579": 69, "58": [24, 69, 74], "580": 69, "5821": 69, "5830": 87, "5874": 87, "5877": 72, "5879": 87, "588": 69, "58x": 24, "59": 68, "590": [32, 69], "5918": 87, "5942": 18, "5957": 87, "5976": 72, "598": 69, "5980": 72, "5b": 88, "5th": 77, "5u": 1, "5x": [20, 23, 24], "6": [0, 1, 6, 8, 9, 10, 21, 23, 24, 26, 39, 49, 65, 69, 77, 82, 86, 87, 88], "60": [0, 69], "600": 27, "6000": 68, "602": 69, "6049": 72, "6059": 68, "6064": 87, "608": 69, "61": 69, "610": 69, "6100": 18, "6157": 87, "618": 69, "62": [24, 69, 74], "6255": 87, "626": 32, "6299": 87, "63": [39, 49, 60, 68, 69, 74, 79, 84], "630": 69, "63266": 70, "63307": 70, "63308": 70, "63331": 70, "63374": 70, "634": 69, "63456": 70, "6345624": 70, "6372": 72, "639": 88, "64": [0, 1, 5, 6, 13, 18, 20, 21, 25, 30, 35, 53, 56, 
69, 74, 77, 78, 79, 84, 88], "640": [19, 69], "6452": 75, "6475": 74, "649": 88, "64x": 24, "65": [62, 69], "65024": 87, "6523": 75, "653": 69, "654": 21, "6550": 72, "6554": 74, "656": 69, "657": 69, "659": 69, "6591": 68, "66": [24, 69], "661": 69, "6628": [74, 75], "6678": 84, "6684": 75, "6695": 84, "67": [23, 24, 69], "6701": 18, "671": 18, "67108864": 60, "673": 88, "675": 68, "6753e": 68, "6769": 74, "679": 20, "68": [23, 24, 69, 75], "682": 69, "6825": 68, "683": 69, "684": 24, "685": 69, "6852": [72, 74], "686": 69, "6862": 68, "6890": 87, "69": [23, 24, 69, 75, 83], "6925": 68, "6938": 32, "695": 88, "696": 69, "6975": 72, "6976": [70, 74, 75], "698": 69, "6a": 19, "6b": [20, 68, 77, 88], "6x": 21, "7": [0, 1, 8, 9, 19, 20, 23, 24, 39, 49, 59, 60, 61, 62, 68, 69, 70, 77, 82, 87], "70": [0, 23, 75, 84], "700": 27, "7000": 68, "701": 88, "7031": 72, "704": 69, "705": 88, "706": 69, "7063": 68, "707": 69, "7072": 69, "709": 68, "7090": 87, "70b": [5, 17, 21, 23, 49, 70, 72, 73, 74, 75, 76, 86, 88], "70g": 17, "71": [24, 68, 69], "711": 69, "712": 69, "7134": 87, "7136": 70, "714": 69, "7144": 87, "7168": 24, "717": 69, "7187": 69, "7188": 18, "72": [69, 71], "722": 69, "727": 69, "72b": [86, 88], "73": [24, 69], "732": 69, "734": 69, "736": 69, "737": 69, "7382": 69, "739": 88, "74": [24, 69], "741": [69, 88], "742": 69, "745": 69, "7456": 18, "74561": 18, "747": 69, "7480": 70, "75": [23, 68, 88], "750": [21, 69], "7502": 70, "7520": 18, "755": 27, "7584": 18, "75903": 69, "76": 69, "7607": 74, "7621": 69, "7638": [70, 74, 75], "767": 69, "768": [13, 78], "77": 69, "772": 69, "7743": 70, "7770": 70, "78": [24, 69, 72], "780": 68, "7842": 72, "78509": 69, "7876": 74, "79": [68, 84], "7900": 87, "7933": 74, "794": [69, 88], "7949": 87, "7977": 72, "7a": 19, "7b": [9, 10, 11, 23, 26, 39, 49, 68, 69, 83, 86, 88], "7x": [20, 24], "8": [0, 1, 5, 8, 9, 13, 14, 17, 18, 19, 21, 22, 23, 24, 25, 26, 32, 33, 36, 38, 39, 41, 42, 43, 44, 45, 47, 49, 50, 51, 52, 54, 59, 
61, 62, 65, 68, 69, 70, 71, 72, 76, 77, 78, 79, 83, 84, 85, 87], "80": [0, 6, 21, 24, 60, 69, 88], "800": [19, 69, 88], "8000": [26, 29, 30, 31, 33, 34, 35, 55, 56, 57, 83], "8002": 68, "8005": 69, "803": 19, "8048": 68, "80gb": [20, 23, 25, 69, 70, 72, 73], "81": [24, 69, 72], "810": 69, "8149": 87, "8179": 87, "819": 21, "8192": [25, 65, 68, 69, 70, 74, 77, 78, 87, 88], "82": [24, 69, 72], "820": 68, "8212": 1, "8218": 87, "822": 69, "8225": 72, "825": 88, "8259": 68, "83": 69, "8307": 75, "8351": 68, "838": 69, "84": [24, 69], "840": 69, "841": 69, "8441": 68, "85": [18, 23, 68, 69, 88], "850": 69, "851": 69, "854": 69, "86": [60, 69], "863": 68, "866": 69, "867": 69, "8672": 87, "87": [23, 69], "8779": 87, "88": [69, 72, 75], "8804": 70, "8828": 87, "8841": 72, "89": [23, 24, 60, 69, 86], "893": 69, "8932": 68, "8958": 75, "896": [48, 69], "8a": 22, "8b": [41, 49, 64, 68, 83, 86, 89], "8bit": 20, "8tb": 21, "8x7b": [4, 68, 86, 88], "8xb200": 24, "8xh100": 22, "8xh200": 19, "9": [0, 1, 9, 10, 17, 20, 24, 39, 49, 54, 59, 69, 72, 77, 87], "90": [0, 18, 60, 65, 68, 69, 70, 72, 76, 84], "9007": 18, "9028": 87, "907": 20, "9087": 75, "91": 69, "910": 69, "9101": 69, "911": 69, "9115": 75, "912656": 18, "913": 69, "9184": 72, "92": [24, 69], "920": 69, "9203": 72, "9214": 69, "924": 13, "925": 69, "9274": 70, "93": 69, "935": 88, "9353e": 70, "9379": 18, "94": 69, "94022": 69, "941": [19, 22], "943": 48, "944": 69, "946": 19, "947": 69, "9494": 74, "95": [26, 33, 36, 38, 39, 41, 42, 43, 44, 45, 47, 49, 54, 61, 62, 69, 70, 76, 83], "9521": 87, "953": 69, "9537": 72, "956": 69, "957": 69, "96": [19, 24, 69, 72, 88], "960": 19, "961": 69, "9623": 74, "963": 69, "9639": 69, "96583": 69, "967": 88, "9692": 87, "97": [68, 69, 72], "970": 69, "98": 69, "983": 88, "987": 88, "99": [8, 24, 27, 69], "990": 69, "991": 69, "992": 88, "9928": 75, "9938": 18, "9982": [74, 75], "9x": [21, 22], "A": [0, 1, 2, 3, 5, 6, 9, 10, 13, 14, 17, 18, 23, 24, 47, 50, 51, 52, 53, 65, 67, 68, 69, 
77, 82, 88, 90, 92], "AND": 77, "And": [10, 17, 77, 78, 84], "As": [4, 5, 7, 9, 10, 14, 16, 32, 72, 75, 76, 77, 84, 85, 87, 92, 93], "At": [12, 53, 72, 78, 84], "But": [5, 66], "By": [0, 1, 2, 6, 10, 24, 32, 60, 68, 72, 75, 77, 87, 92], "For": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 32, 36, 47, 50, 51, 52, 58, 60, 64, 68, 69, 70, 71, 72, 74, 75, 76, 77, 82, 83, 84, 87, 88, 90, 91, 92, 93, 94], "If": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 14, 15, 17, 23, 25, 26, 27, 28, 60, 61, 62, 64, 65, 66, 68, 70, 71, 72, 74, 75, 76, 77, 79, 82, 84, 86, 87, 88, 90, 92, 93, 94], "In": [0, 1, 2, 7, 10, 14, 15, 17, 18, 20, 23, 24, 28, 32, 49, 53, 59, 60, 68, 69, 70, 71, 72, 74, 75, 77, 83, 84, 85, 86, 87, 88, 92, 93, 94], "It": [0, 1, 3, 5, 6, 7, 9, 10, 12, 14, 15, 16, 18, 19, 22, 23, 24, 25, 32, 46, 53, 60, 65, 66, 68, 69, 72, 73, 74, 75, 76, 77, 83, 85, 87, 90, 91, 92, 94], "Its": [5, 77, 92], "NOT": 77, "No": [0, 2, 8, 53, 68, 70], "Not": [1, 23], "ON": [68, 72, 74, 75], "OR": 77, "Of": [24, 88], "On": [5, 8, 60, 62, 67, 71, 75, 77, 88], "One": [2, 13, 14, 74, 77, 87, 91], "Or": [77, 82, 89], "That": [3, 5, 6, 8, 14, 66, 72, 77], "The": [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 32, 36, 38, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54, 59, 60, 61, 62, 64, 65, 67, 68, 69, 70, 71, 72, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84, 86, 87, 88, 89, 90, 91, 92, 93, 94], "Then": [9, 17, 26, 27, 68, 70, 77, 90, 93], "There": [2, 5, 6, 7, 8, 9, 13, 17, 21, 24, 32, 60, 62, 64, 77, 80, 84, 85, 87, 88, 91, 92, 93, 94], "These": [2, 10, 17, 19, 21, 22, 24, 32, 68, 70, 71, 78, 80, 83, 88], "To": [2, 3, 5, 8, 9, 10, 11, 14, 15, 16, 17, 18, 21, 24, 60, 64, 65, 66, 67, 68, 69, 72, 74, 75, 76, 77, 83, 84, 85, 88, 89, 90, 92, 93, 94], "Will": 0, "With": [5, 6, 10, 14, 27, 32, 44, 59, 68], "_": [0, 3, 15, 80], "__all__": 90, "__call__": 47, "__init__": [7, 12, 14, 15, 47, 65, 68, 87, 88, 90, 92, 
94], "__main__": [36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 53, 54, 61, 62, 64, 70, 72, 75, 76, 83, 88, 89, 90], "__name__": [36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 53, 54, 61, 62, 70, 72, 75, 76, 83, 88, 89, 90], "__post_init__": 88, "__repr__": 88, "_capac": 1, "_context_logits_auto_en": 65, "_cpp_gen": 3, "_create_tensor": 14, "_explicitly_disable_gemm_plugin": 80, "_generation_logits_auto_en": 65, "_handl": 1, "_mark_output": 87, "_note": 5, "_path": 18, "_postproc_param": 65, "_postprocess_result": 65, "_return_log_prob": 65, "_run": 87, "_runtim": 82, "_str_to_trt_dtype_dict": 77, "_torch": [46, 68, 88, 89, 90, 91, 92], "_unsign": 1, "_util": 77, "a10": 25, "a100": [6, 17, 25], "a10g": 25, "a2": 88, "a30": 25, "a40": 25, "a8": 85, "a_": 77, "a_1": 77, "a_2": 77, "a_n": 77, "a_sf": 77, "aarch64": 86, "ab": [9, 25, 77, 85], "abbrevi": 26, "abi": [60, 88], "abil": [66, 68], "abl": [5, 20, 24, 62, 68, 74, 77, 88], "abnorm": 88, "abort": [65, 88], "about": [0, 1, 3, 17, 18, 19, 20, 22, 23, 46, 53, 54, 59, 68, 70, 72, 73, 75, 77, 83, 84, 87, 88], "abov": [2, 9, 14, 17, 18, 23, 32, 60, 68, 69, 70, 72, 75, 84], "absenc": 6, "absorb": 24, "abstract": [75, 78], "ac": 88, "acc": 77, "acceler": [5, 10, 20, 21, 22, 23, 25, 66], "accept": [0, 1, 10, 18, 32, 41, 42, 43, 44, 45, 60, 65, 70, 72, 77, 82, 83, 86, 88, 92], "accept_length": 82, "acceptancer": 0, "acceptancethreshold": 0, "acceptedlen": 1, "acceptedlengthscumsum": 1, "acceptedpath": 1, "acceptedpathid": 1, "acceptedtoken": 1, "acceptedtokenslen": 1, "access": [3, 28, 40, 68, 70, 77, 83, 88], "accessor": 1, "accommod": [4, 91, 93], "accomplish": 71, "accord": [5, 15, 54, 77, 78, 92], "accordingli": 15, "account": [14, 18, 27, 50, 51, 52, 60], "accumul": [0, 5, 6, 25, 65, 77, 82, 83], "accur": [19, 40, 68, 70, 88], "accuraci": [19, 24, 25, 72, 76, 77, 85, 88], "achiev": [2, 10, 18, 19, 23, 24, 60, 69, 70, 72, 74, 76, 90], "across": [2, 4, 5, 6, 7, 14, 15, 21, 24, 26, 69, 71, 72, 74, 75, 77, 
82], "act": 24, "act_fn": 78, "act_typ": [14, 77], "action": 49, "activ": [0, 1, 5, 7, 14, 18, 19, 20, 23, 24, 25, 71, 77, 85, 86, 88, 94], "activation_scaling_factor": 13, "activationtyp": [14, 77], "active_request": 94, "actual": [7, 10, 18, 23, 24, 25, 72, 74, 75, 76, 88, 93], "ad": [1, 5, 6, 7, 8, 10, 11, 17, 28, 59, 67, 71, 74, 75, 77, 79, 82, 88, 89, 91], "ada": [5, 23, 54, 60, 66, 72, 86, 88], "adalayernorm": 78, "adalayernormcontinu": 78, "adalayernormzero": 78, "adalayernormzerosingl": 78, "adapt": [0, 9, 36, 37, 65, 77, 78, 88, 90], "adapter_s": 9, "adapters": 1, "add": [1, 3, 5, 7, 9, 12, 13, 14, 17, 27, 28, 47, 49, 60, 64, 65, 68, 70, 72, 75, 77, 82, 87, 88, 90, 93], "add_activ": 14, "add_argu": 49, "add_bias_linear": 79, "add_generation_prompt": 24, "add_input": 77, "add_output": 77, "add_padding_request": 93, "add_qkv_bia": 79, "add_rmsnorm": 24, "add_sequ": 82, "add_special_token": [24, 65, 82, 88], "addcumlogprob": 88, "added_kv_proj_dim": 78, "added_proj_bia": 78, "addit": [0, 5, 6, 9, 10, 14, 17, 21, 26, 32, 40, 60, 65, 68, 69, 71, 72, 74, 77, 78, 85, 86, 87, 88, 92, 93], "addition": [2, 68, 70, 72, 75, 90, 92], "additional_model_output": 65, "additional_opt": 52, "additionalmodeloutput": [0, 3, 65], "additionaloutput": [0, 3], "address": [1, 15, 18, 23, 24, 64, 75, 84, 88], "addresswiths": 1, "adequ": 78, "adher": 40, "adjust": [50, 65, 68, 70, 84, 94], "admin": 62, "adopt": [6, 17], "advanc": [10, 14, 22, 24, 25, 38, 41, 42, 44, 45, 60, 65, 77, 88, 92], "advantag": [6, 66], "advers": [19, 25], "advertis": 68, "advis": 2, "affect": [17, 18, 25, 70, 72, 74, 75, 84], "affin": 78, "after": [0, 1, 3, 5, 7, 8, 9, 10, 14, 15, 24, 25, 26, 27, 46, 49, 60, 64, 65, 68, 72, 74, 75, 76, 77, 78, 80, 83, 84, 88, 92, 94], "again": [14, 70, 72, 75, 87], "against": [60, 68], "agent": 21, "aggress": [13, 72, 76], "agre": [64, 83], "ahead": [0, 5, 10], "ai": [18, 20, 24, 26, 33, 36, 38, 39, 41, 42, 43, 44, 45, 49, 54, 61, 62, 66, 67, 70, 76, 77, 83, 86, 88, 89], 
"aidc": 88, "aim": [4, 13, 18, 24, 66, 68, 70, 72, 88], "ainsli": 19, "air": 88, "aka": 77, "akhoroshev": 88, "al": 19, "albeit": 10, "alessionetti": 88, "algorithm": [0, 5, 6, 10, 13, 14, 17, 23, 24, 65, 68, 72, 77, 88], "alia": [78, 79], "alibi": 77, "alibi_bias_max": [77, 78], "alibi_scal": 77, "alibi_slop": 77, "alibi_with_scal": 77, "align": [68, 88, 94], "align_corn": 77, "all": [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 14, 15, 17, 18, 21, 24, 47, 50, 51, 52, 53, 60, 65, 66, 68, 69, 70, 71, 72, 74, 75, 76, 77, 78, 80, 82, 83, 84, 85, 86, 87, 88, 92, 93, 94], "all_reduce_param": [77, 78], "allbitset": [0, 1], "allgath": [14, 25, 75, 77, 88], "allgeneratedtoken": 0, "alllayersdrafttokenid": 1, "alllayersdrafttokenidspredecessor": 1, "alllayersscor": 1, "alloc": [0, 1, 2, 5, 8, 26, 32, 65, 76, 77, 82, 84, 87, 88, 91, 92, 93, 94], "allocateipcmemori": 1, "allocatespeculativedecodingbuff": 1, "allocnewblock": 0, "allocnewblocksperrequest": 0, "alloctotalblock": 0, "alloctotalblocksperrequest": 0, "allot": 0, "allottedtimem": [0, 88], "allow": [0, 1, 2, 3, 5, 6, 8, 10, 13, 19, 22, 25, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 77, 80, 87, 88, 91, 94], "allowed_token_id": 47, "allreduc": [14, 24, 25, 75, 77, 88], "allreducebuff": 1, "allreducefusionkernel": 24, "allreducefusionop": 77, "allreduceparam": [77, 78], "allreducestrategi": 77, "almost": [14, 72, 74, 84], "alon": 4, "along": [5, 10, 16, 60, 77, 88], "alpaca": 9, "alpha": [65, 77, 78, 88], "alphabet": 77, "alreadi": [0, 5, 7, 8, 16, 18, 24, 65, 72, 74, 76, 77, 88, 90, 93], "also": [0, 2, 3, 5, 7, 10, 13, 14, 15, 16, 17, 18, 21, 22, 23, 24, 25, 26, 32, 44, 46, 60, 64, 68, 69, 70, 71, 72, 73, 74, 77, 78, 83, 84, 85, 88, 90, 91, 92, 93], "altair": 88, "alter": [3, 7], "altern": [3, 24, 47, 60, 90, 91], "although": [7, 14, 68, 72, 75], "alwai": [0, 1, 3, 5, 6, 8, 13, 14, 17, 48, 65, 74, 75, 77, 87], "always_share_across_beam": 82, "am": [38, 41, 42, 44, 45, 47, 54, 70, 76, 82], "ambigu": 1, "amd": 88, "amen": [0, 3, 
65], "among": [28, 77], "amongst": 77, "amount": [0, 8, 14, 25, 65, 68, 74, 76, 82, 84, 87], "amper": [20, 60, 66, 86, 88], "an": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 21, 23, 24, 25, 26, 32, 38, 40, 41, 42, 43, 44, 45, 47, 54, 60, 62, 64, 65, 66, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 82, 83, 84, 85, 87, 88, 90, 91, 92, 93, 94], "analysi": [7, 24, 59, 84], "analysispatternmanag": 7, "analyt": 20, "analyz": [7, 70], "ani": [0, 1, 2, 3, 7, 10, 15, 17, 18, 26, 47, 60, 64, 65, 66, 68, 69, 74, 75, 76, 77, 79, 82, 87, 90, 91, 92], "announc": [18, 19, 20, 22], "anoth": [0, 1, 5, 7, 9, 17, 20, 24, 26, 74, 77, 87, 92, 94], "answer": 40, "antialia": 77, "antonin": [38, 41, 42, 44, 45], "anybitset": [0, 1], "anyth": [53, 69], "aotman": 88, "apart": 32, "api": [2, 6, 8, 10, 12, 13, 14, 16, 18, 32, 33, 44, 50, 51, 52, 59, 60, 66, 67, 68, 69, 72, 73, 75, 76, 77, 84, 87, 89], "api_kei": [26, 55, 56, 57], "app": [60, 88], "appar": 66, "appear": [0, 5, 6, 46, 62, 65, 77, 87, 88], "append": [47, 54, 67, 77, 94], "append_paged_kv_cach": 92, "appl": 88, "appli": [0, 2, 3, 5, 7, 9, 10, 13, 14, 15, 24, 25, 60, 65, 66, 68, 77, 78, 82, 85, 88, 92], "applic": [8, 10, 20, 23, 24, 26, 29, 30, 31, 62, 64, 66, 67, 83, 87, 88, 94], "apply_batched_logits_processor": [47, 65], "apply_chat_templ": [24, 40], "apply_llama3_sc": 77, "apply_query_key_layer_sc": [78, 79], "apply_residual_connection_post_layernorm": 79, "apply_rotary_pos_emb": 77, "apply_rotary_pos_emb_chatglm": 77, "apply_rotary_pos_emb_cogvlm": 77, "apply_silu": 77, "applybiasropeupdatekvcach": 88, "applyrop": 24, "approach": [2, 4, 7, 8, 10, 24, 64, 68, 76], "appropri": [23, 32, 87], "approxim": [60, 78], "apt": [18, 27, 60, 61, 62], "ar": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 29, 30, 38, 40, 41, 42, 44, 45, 47, 48, 49, 50, 51, 52, 53, 55, 56, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84, 85, 87, 88, 
89, 90, 91, 92, 93, 94], "arang": 77, "arbitrag": 68, "arbitrari": [15, 88], "architectur": [2, 4, 6, 8, 13, 20, 60, 66, 79, 82, 86, 88, 89], "arctic": [86, 88], "area": 54, "aresult": 32, "arg": [7, 17, 26, 49, 65, 78, 79, 82, 88], "arglist": 7, "argmax": 77, "argpars": 49, "argument": [2, 3, 18, 26, 32, 44, 47, 60, 64, 65, 68, 71, 77, 84, 88, 92], "argumentpars": 49, "aris": 60, "arithmet": 14, "armor": 46, "around": [1, 13, 17, 66, 70, 75], "arrai": [0, 1, 65, 77, 82], "arrayview": [0, 1], "arriv": [0, 4], "arrivaltim": 0, "arrow": 77, "art": [18, 24], "articl": [5, 10, 24], "artifici": 66, "artist": 54, "arxiv": [0, 1, 4, 9, 25, 77, 85], "as_dtyp": 77, "as_lay": 7, "as_shap": 77, "ascii": 77, "asciichar": 1, "ask": [46, 53, 87], "aspect": 5, "assembl": [14, 16], "assert": [7, 77, 87, 88, 94], "assert_valid_quant_algo": 79, "assign": [0, 2, 17, 78, 80, 90], "assist": [6, 26, 29, 30, 40, 55, 56, 64, 83], "assistant_model": 6, "associ": [1, 3, 4, 9, 60, 70, 77], "asssembl": 10, "assum": [1, 3, 8, 9, 10, 11, 18, 65, 68, 77, 79, 82], "assumpt": [10, 25], "async": [32, 42, 43, 65, 68, 82], "asynchron": [1, 3, 32, 36, 37, 65], "asyncio": [42, 43], "asyncllmengin": 88, "atom": 1, "attach": [2, 18], "attempt": [0, 2, 69, 70, 72], "attend": 76, "attent": [0, 1, 2, 6, 8, 9, 10, 12, 14, 15, 18, 19, 25, 59, 65, 77, 82, 83, 84, 87, 88, 89, 90, 93], "attention_backend": [90, 92], "attention_head_s": [77, 78], "attention_mask": [77, 78, 79, 82, 92], "attention_mask_param": 79, "attention_mask_typ": 78, "attention_multipli": 79, "attention_output": 87, "attention_output_orig_quant_scal": 77, "attention_output_sf_scal": 77, "attention_packed_mask": [77, 78], "attention_param": [78, 79], "attention_qk_half_accumul": 88, "attentionconfig": 0, "attentionheads": 1, "attentionmask": 92, "attentionmaskparam": 78, "attentionmasktyp": [77, 78], "attentionmetadata": 90, "attentionparam": [78, 79], "attentiontyp": 0, "attn_backend": 92, "attn_bia": 79, "attn_dens": [9, 25], 
"attn_forward_funcnam": 78, "attn_k": [9, 25], "attn_logit_softcap": 79, "attn_logit_softcapping_scal": 77, "attn_metadata": 90, "attn_processor": 79, "attn_q": [9, 25], "attn_qkv": [9, 25], "attn_v": [9, 25], "attribut": [0, 1, 3, 7, 15, 17, 82], "audio": [82, 88], "audio_engine_dir": 82, "audio_featur": 82, "audio_path": 82, "authent": [64, 70, 83], "authorized_kei": [27, 28], "auto": [0, 1, 2, 3, 5, 6, 11, 14, 38, 46, 65, 68, 75, 77, 79, 80, 81, 88], "auto_deploi": 88, "auto_parallel": [25, 38, 65, 88], "auto_parallel_config": 65, "auto_parallel_world_s": [38, 65], "auto_quantize_bit": 81, "autoawq": 88, "autodeploi": 88, "autogptq": 88, "autom": [40, 88], "automat": [0, 3, 7, 14, 15, 24, 26, 32, 36, 37, 64, 66, 68, 70, 77, 84, 85, 88], "autoparallelconfig": 65, "autopp": 88, "autoq": 88, "autoregress": [0, 10, 92, 93], "autotoken": 32, "autotun": 88, "autotuner_en": 46, "aux": 84, "auxiliari": 10, "avaiable_block": 94, "avail": [0, 1, 3, 7, 8, 14, 19, 21, 26, 32, 38, 41, 42, 44, 45, 47, 60, 66, 68, 74, 75, 76, 82, 83, 84, 85, 88, 89, 92, 93], "averag": [0, 10, 18, 65, 68, 69, 70, 72, 74, 75], "avg": [68, 70, 77], "avg_pool2d": 77, "avgnumdecodedtokensperit": 0, "avgpool2d": 78, "avoid": [1, 2, 17, 24, 60, 64, 82, 84, 88], "awai": [74, 75], "await": [0, 3, 32, 42, 43], "awaitcontextrespons": 0, "awaitgenerationrespons": 0, "awaitrespons": [0, 2, 3], "awar": [2, 5, 19, 87], "awq": [23, 32, 54, 59, 86, 88], "awq_block_s": 81, "ax": 77, "axi": [22, 77], "b": [1, 2, 7, 9, 14, 19, 20, 21, 22, 67, 77, 79, 82, 88], "b200": [69, 88], "b_sf": 77, "back": [0, 2, 8, 10, 41, 44, 62, 69, 88], "backbon": 66, "backend": [0, 2, 3, 10, 14, 16, 18, 26, 33, 40, 46, 47, 50, 51, 52, 59, 65, 67, 68, 69, 83, 88, 91, 93, 94], "backend_token": [0, 3], "backu": [0, 3, 65], "backward": 17, "bad": [0, 3, 65, 88], "bad_token_id": 65, "bad_words_data": 82, "bad_words_list": 82, "badword": 0, "badwordslen": 1, "badwordslist": 1, "badwordsptr": 1, "baichuan": [64, 85, 86, 88], "baichuan2": 86, 
"baichuanconfig": 79, "baichuanforcausallm": 79, "balanc": [4, 6, 10, 14, 74, 76], "band": 40, "bandwidth": [6, 14, 19, 20, 21, 23, 40], "bangbang": 20, "bantoken": 0, "banword": 0, "bar": 65, "bare": [88, 89], "barissglc": 53, "barnardo": 46, "bart": [86, 88], "base": [0, 1, 2, 3, 8, 9, 10, 12, 15, 16, 17, 18, 19, 20, 23, 24, 25, 42, 43, 49, 60, 65, 66, 68, 74, 76, 77, 78, 79, 80, 81, 82, 84, 86, 88, 89, 90, 91, 93, 94], "base64": 56, "base_model": 9, "base_s": 78, "base_url": [26, 55, 56, 57], "basekvcachemanag": 0, "baselin": [23, 24, 70, 74, 75, 92], "baseline_fp8_engin": 72, "basemodel": 65, "baseresourcemanag": [91, 93], "bash": [14, 26, 28, 29, 30, 31, 33, 34, 35, 50, 51, 52, 67], "basic": [12, 67, 77], "basic_string_view": 0, "batch": [0, 1, 6, 8, 9, 10, 11, 14, 16, 18, 20, 21, 23, 24, 25, 26, 47, 59, 63, 65, 68, 69, 70, 72, 73, 75, 76, 77, 78, 82, 83, 84, 87, 88, 90, 91, 92, 93, 94], "batch_beam_s": [5, 77], "batch_dim": 77, "batch_idx": 82, "batch_input_id": 82, "batch_manag": [0, 1, 93], "batch_schedul": 88, "batch_siz": [5, 7, 11, 13, 19, 22, 77, 78, 81, 82, 84, 92], "batchdon": 1, "batched_logits_processor": [47, 65], "batchedlogitsprocessor": [47, 65], "batchidx": 1, "batchindex": 1, "batching_typ": 65, "batchingtyp": [0, 65], "batchsiz": [0, 1, 6, 20], "batchsizelimit": 0, "batchsizet": 0, "batchslot": 1, "batchslotshostcopi": 1, "batchslotsrequestord": 1, "bc": 77, "beam": [0, 1, 6, 10, 16, 22, 25, 26, 32, 44, 59, 65, 77, 82, 84, 87, 88], "beam_search_diversity_r": [65, 82], "beam_width": [5, 6, 32, 77, 82, 88], "beam_width_arrai": 65, "beamhypothes": 1, "beamsearch": 0, "beamsearchbuff": 1, "beamsearchdiversityr": [0, 1, 6], "beamsiz": 0, "beamtoken": [0, 3], "beamwidth": [0, 1, 2, 3, 6, 65, 88], "beamwidtharrai": [0, 1, 6], "becam": 0, "becaus": [0, 3, 8, 18, 23, 24, 25, 32, 48, 53, 64, 68, 69, 70, 71, 72, 74, 76, 77, 84], "becom": [5, 6, 7, 8, 9, 14, 15, 23, 24, 46, 66], "been": [0, 3, 4, 5, 17, 20, 21, 24, 28, 49, 53, 60, 62, 65, 68, 72, 74, 77, 
87, 88], "befor": [0, 1, 2, 3, 5, 7, 8, 9, 13, 14, 15, 24, 50, 51, 52, 59, 60, 62, 66, 67, 71, 72, 74, 76, 77, 79, 82, 84, 87, 88, 90, 91, 92, 93, 94], "beforehand": 70, "begin": [10, 64, 66, 71, 88, 90], "behav": [0, 84], "behavior": [2, 5, 69, 74, 77, 82, 84, 88], "behaviour": [0, 77], "behind": 20, "being": [0, 5, 8, 14, 17, 53, 65, 74, 87, 88, 92], "believ": [46, 68], "belong": 74, "below": [0, 5, 6, 7, 9, 18, 21, 22, 23, 27, 28, 68, 69, 72, 74, 75, 87], "bench": [18, 36, 37, 53, 68, 69, 73, 88], "benchmark": [24, 51, 59, 60, 67, 72, 73, 75, 83, 88], "benchmark_2nod": 26, "benefici": [68, 74, 75], "benefit": [7, 8, 21, 23, 25, 66, 74, 88], "bert": [25, 77, 85, 86, 88], "bert_attent": 77, "bert_attention_plugin": 25, "bert_context_fmha_fp32_acc": 25, "bertattent": 78, "bertattentionplugin": 77, "bertbas": 79, "bertforquestionansw": 79, "bertforsequenceclassif": [79, 86], "bertmodel": 79, "besid": 91, "best": [5, 14, 24, 48, 65, 67, 68, 71, 73, 74, 83, 88], "best_of": [65, 88], "best_path": 82, "best_path_len": 82, "best_path_length": 82, "best_perf_practice_on_deepseek": [24, 88], "bestpathindic": 1, "bestpathlength": 1, "beta": [26, 77], "beta_fast": 77, "beta_slow": 77, "better": [0, 2, 5, 6, 8, 15, 17, 22, 24, 25, 50, 51, 52, 65, 69, 71, 72, 75, 76, 88], "between": [0, 2, 5, 6, 8, 10, 14, 15, 17, 24, 30, 56, 62, 65, 67, 69, 71, 75, 76, 77, 78, 84, 87, 88, 90], "beyond": [1, 20, 72], "bf16": [5, 15, 17, 24, 59, 72, 75, 86, 88], "bfloat16": [5, 14, 25, 68, 70, 80, 85, 86, 88], "bhuvanesh09": 88, "bi": 5, "bia": [0, 3, 13, 14, 65, 77, 78, 79, 88], "bias": [13, 77], "bidirect": [77, 78], "bidirectionalglm": 77, "bigger": 8, "biggest": 8, "billion": 18, "bin": [13, 14, 15, 18, 26, 29, 30, 31, 33, 34, 35, 50, 51, 52, 67, 87, 88], "binari": [10, 14, 67, 77], "bind": [47, 59, 65, 76, 82, 84, 88, 91, 93, 94], "bindcapacityschedul": 94, "bit": [0, 1, 5, 20, 53, 77, 85], "bitmask": 88, "bl": [10, 79], "black": 7, "blackwel": [2, 18, 54, 59, 62, 71, 72, 86, 88], "blip": 
[85, 88], "blip2": [85, 86, 88], "blob": 24, "block": [0, 1, 2, 5, 6, 8, 14, 25, 32, 46, 47, 64, 65, 74, 77, 82, 84, 88, 93], "block_controlnet_hidden_st": 79, "block_hash": 46, "block_num": 77, "block_siz": [77, 78, 82], "block_sparse_block_s": 77, "block_sparse_homo_head_pattern": 77, "block_sparse_num_local_block": 77, "block_sparse_param": 78, "block_sparse_vertical_strid": 77, "blockhash": 0, "blockidx": 1, "blockptr": 1, "blocksiz": 0, "blockspars": 77, "blocksparseattnparam": 78, "blog": [18, 19, 22, 23, 24, 88], "bloodeagle40234": 88, "bloom": [6, 15, 85, 86, 88], "bloom_dict": 15, "bloomforcausallm": 79, "bloommodel": 79, "bm": 1, "bmm": 14, "board": 75, "bodi": 14, "book": 53, "bool": [0, 1, 7, 11, 13, 65, 77, 78, 79, 80, 82, 92], "boolean": [1, 3, 9, 77, 79, 80], "boost": [18, 24, 72, 74, 75], "born": [12, 14, 87], "borrow": [32, 44, 68], "bos_token_id": 82, "both": [0, 2, 4, 5, 7, 9, 10, 14, 15, 18, 20, 23, 24, 25, 36, 49, 65, 68, 69, 71, 74, 76, 77, 78, 84, 85, 88, 91, 92], "bottleneck": [4, 18, 23, 71, 74], "bottom": 28, "bound": [0, 6, 12, 14, 21, 24, 65, 68, 77, 82, 84], "boundari": [6, 14, 65, 77, 79, 81, 84], "box": [7, 18], "bpru": 88, "brahma": 68, "branch": [10, 19, 22, 65], "breadth": 10, "break": [10, 24, 64, 68, 75, 88, 94], "breakdown": [67, 68, 69, 70], "breviti": 18, "brief": [79, 82, 92], "briefli": [30, 56], "brife": 0, "bring": [23, 24, 90], "broadcast": [3, 24, 77], "broadcast_help": 77, "broader": [5, 88], "broken": [66, 74, 88], "bsz": 78, "bu": 60, "budget": [11, 74], "buffer": [0, 1, 2, 3, 8, 25, 26, 59, 65, 77, 88, 93], "buffer_0": 1, "buffer_1": 1, "buffer_2": 1, "buffer_alloc": 82, "buffercast": 1, "buffercastornul": 1, "bufferdatatyp": 1, "buffermanag": 84, "buffermanagertest": 1, "bufferptr": 1, "bufferrang": 1, "buffers": 1, "bufferview": 0, "bug": 88, "build": [2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 39, 44, 46, 48, 49, 53, 59, 64, 65, 66, 67, 71, 72, 73, 74, 76, 79, 80, 83, 84, 87, 88], "build_cach": 65, "build_config": 
[17, 25, 32, 39, 44, 48, 49, 53, 65, 72, 74, 75, 79], "build_dir": 60, "build_engin": 14, "build_flags_multiple_profil": 75, "build_serialized_network": 14, "build_wheel": [18, 60, 67], "buildcacheconfig": 65, "buildconfig": [11, 17, 32, 39, 44, 48, 49, 53, 65, 72, 74, 75, 88], "builder": [11, 14, 17, 65, 88], "builder_force_num_profil": 88, "builder_opt": 88, "built": [3, 6, 8, 14, 17, 25, 54, 60, 62, 64, 65, 68, 69, 70, 75, 76, 77, 83, 84, 87, 88], "bump": 1, "bumptaskinprogress": 1, "burden": 71, "busi": 0, "button": 88, "buvnswrn": 88, "bw": 88, "byt5": [86, 88], "byte": [0, 1, 65, 82], "bytestostr": 1, "c": [0, 1, 2, 5, 7, 10, 14, 16, 18, 26, 27, 28, 32, 50, 51, 52, 59, 65, 66, 67, 74, 77, 79, 83, 88, 91, 93, 94], "cach": [0, 1, 2, 3, 6, 9, 14, 17, 23, 24, 25, 26, 32, 36, 37, 39, 49, 59, 63, 65, 66, 68, 69, 70, 74, 77, 82, 83, 85, 88, 89, 90, 91, 92, 94], "cache_indir": 82, "cache_indir_t": 77, "cache_indirect": [5, 77, 78, 82, 87], "cache_root": 65, "cache_transceiver_config": 65, "cachehitr": 0, "cacheindirect": 1, "cachelevel": 0, "cachelevelupd": 0, "caches": 0, "cachest": 0, "cachetransceiv": 0, "cachetransceiverconfig": [0, 65], "cachetyp": 93, "cachevalu": 1, "calcul": [0, 19, 20, 22, 65, 68, 76, 77, 82, 84, 88], "calculate_speculative_resourc": 65, "calculatespeculativeresourc": 0, "calculatespeculativeresourcetupl": 0, "calib_batch": [54, 65, 72, 79], "calib_batch_s": [65, 72, 79], "calib_config": [54, 65, 72], "calib_dataset": [54, 65, 79, 81], "calib_max_seq_length": [54, 65, 72, 79, 81], "calib_s": [68, 81], "calibconfig": [54, 65, 72], "calibr": [15, 23, 25, 54, 65, 72, 88], "call": [0, 1, 3, 4, 5, 6, 7, 14, 15, 17, 32, 47, 65, 67, 70, 72, 77, 79, 81, 82, 83, 84, 88, 90, 91, 92, 93], "callabl": [15, 47, 65, 79], "callback": [3, 47, 65], "can": [0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 14, 15, 16, 17, 18, 19, 20, 22, 23, 24, 25, 26, 27, 28, 32, 36, 39, 41, 44, 47, 48, 49, 50, 51, 52, 53, 54, 59, 60, 62, 64, 65, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 79, 
80, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94], "canaccessp": 1, "cancel": [0, 3, 65, 68, 88], "cancelrequest": [0, 3], "candid": [0, 6, 10, 14, 24], "canenqueu": 0, "canenqueuerequest": 0, "cannon": 46, "cannot": [1, 6, 14, 15, 24, 64, 65, 74, 75, 76, 77, 84, 87, 88, 94], "cap": 70, "capabl": [19, 24, 40, 60, 66, 67, 72], "capac": [0, 1, 19, 21, 23, 65, 94], "capacitor_schedul": 94, "capacity_scheduler_polici": [65, 76], "capacityschedul": [91, 93, 94], "capacityschedulerpolici": [0, 65, 76, 88], "capit": [36, 38, 39, 41, 42, 43, 44, 45, 49, 54, 61, 62, 70, 76, 83, 89], "caption": 78, "captur": [65, 92], "card": [48, 53], "carefulli": 18, "case": [0, 1, 2, 5, 6, 8, 9, 10, 18, 20, 23, 24, 25, 32, 68, 69, 70, 72, 73, 75, 77, 85, 88], "cast": 77, "cast_to_dtyp": 77, "castsiz": 1, "cat": [18, 26, 51], "categor": [10, 77], "categori": 80, "categorical_sampl": 77, "caus": [2, 3, 15, 17, 25, 65, 75, 87, 88], "causal": [77, 78, 92], "cautiou": 17, "caveat": 72, "cd": [12, 13, 18, 60, 68, 83, 87, 89], "ceil": [1, 79], "ceil_mod": [77, 78], "ceildiv": 1, "center": [20, 21], "central": 80, "certain": [2, 7, 13, 62, 66, 77], "cg": 79, "challeng": [24, 66], "chanc": [8, 25, 76], "chang": [2, 5, 6, 8, 9, 15, 17, 19, 21, 22, 60, 64, 65, 66, 68, 75, 77, 79, 82, 84, 87, 89, 93], "channel": [25, 77, 85, 88], "char": [0, 1], "charg": [6, 14, 92], "chart": 20, "chat": [10, 21, 31, 34, 36, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 53, 54, 57, 58, 61, 62, 64, 83, 88, 89], "chatbot": 53, "chatcmpl": 83, "chatglm": [64, 77, 85, 86, 88], "chatglm2": [64, 86, 88], "chatglm3": [64, 79, 86, 88], "chatglm_vers": 79, "chatglmconfig": 79, "chatglmforcausallm": 79, "chatglmgenerationsess": 82, "chatglmmodel": 79, "check": [2, 3, 36, 61, 62, 65, 69, 71, 72, 74, 75, 77, 82, 83, 84, 87, 88, 90], "check_accuraci": 13, "check_config": 79, "check_gpt_mem_usag": 84, "checkbeamsearchdiversityr": 0, "checkbeamwidth": 0, "checkbeamwidtharrai": 0, "checkearlystop": 0, "checklengthpenalti": 0, 
"checkminp": 0, "checkmintoken": 0, "checknorepeatngrams": 0, "checknumreturnsequ": 0, "checkpoint": [12, 15, 16, 17, 18, 24, 25, 26, 41, 49, 59, 64, 65, 68, 70, 72, 81, 82, 83, 85, 87, 88, 90], "checkpoint_dir": [9, 11, 12, 13, 14, 17, 25, 68, 83, 87], "checkposteriorvalu": 0, "checkrepetitionpenalti": 0, "checktemperatur": 0, "checktopk": 0, "checktopp": 0, "checktoppdecai": 0, "checktoppmin": 0, "checktoppresetid": 0, "chef": 87, "chmod": 27, "choic": [0, 10, 23, 25, 49, 68, 71, 77, 82, 83, 92], "choos": [14, 17, 24, 72, 77, 88], "chosen": [84, 94], "chrome": 67, "chrono": 0, "chunk": [0, 25, 59, 63, 65, 75, 77, 82, 84, 88], "chunk_dim": 78, "chunk_length": 88, "chunk_scan": 77, "chunk_siz": [77, 79], "chunkedcontextnexttoken": 1, "chunkedcontextnexttokenshost": 1, "ci": 1, "circular": 5, "citi": [54, 83], "ckpt": [49, 68, 83], "ckpt_dir": [14, 17, 79], "ckpt_llama_3": 14, "cl": [12, 17], "claim": [1, 15], "claimpag": 1, "claimpageswithevict": 1, "clamp": [65, 88], "clamp_val": 65, "class": [0, 1, 2, 5, 6, 7, 11, 12, 14, 15, 17, 23, 25, 32, 39, 41, 44, 47, 48, 49, 60, 64, 65, 71, 72, 75, 77, 78, 79, 80, 81, 82, 87, 88, 90, 91, 92, 94], "class_dropout_prob": 78, "class_label": 78, "classic": [14, 59], "classifi": [78, 79], "classmethod": [12, 17, 65, 78, 79, 82], "classvar": 65, "clean": [18, 60, 67, 87], "clear": [62, 74, 82], "clearli": 76, "cli": [13, 18, 32, 59, 68, 71, 72, 74, 75, 83], "click": [27, 28], "client": [0, 3, 26, 58, 69], "client_id": 47, "clientid": 0, "clip": 77, "clip_before_cast": 77, "clip_qkv": [78, 79], "clip_vision_model": 79, "clipvisiontransform": 79, "clock": 24, "clone": [9, 18, 60, 64, 70, 83, 87, 89], "clone_input": 7, "close": [5, 17, 18, 25, 75, 84], "closur": 77, "cloud": [20, 27, 28], "cls_token": 78, "cluster": [6, 14, 24, 25, 26, 62, 65, 88], "cluster_info": 88, "cluster_kei": [25, 88], "cluster_s": 26, "cmake": [60, 88], "cnn_dailymail": [54, 65, 79], "co": [0, 9, 18, 30, 56, 64, 77, 78, 83, 87], "coalesc": 47, "coast": 83, 
"code": [2, 5, 7, 10, 14, 17, 23, 24, 26, 32, 50, 51, 52, 59, 64, 65, 66, 67, 68, 77, 85, 86, 87, 88, 90, 93, 94], "codebas": 90, "codellama": 88, "codepath": 88, "codeqwen": 88, "coderham": 88, "cogvlm": [86, 88], "cogvlmattent": 78, "cogvlmconfig": 79, "cogvlmforcausallm": 79, "coher": [6, 88], "cohereconfig": 79, "cohereforcausallm": 79, "collabor": [6, 24, 54, 77], "collect": [1, 7, 10, 14, 24, 65, 69, 77, 90], "collect_and_bia": 78, "color": [53, 74], "column": [9, 77, 85], "columnlinear": [9, 12, 78], "com": [17, 18, 24, 60, 77, 83, 87, 88, 89], "combin": [0, 7, 10, 21, 24, 25, 49, 50, 51, 52, 68, 69, 72, 74, 78, 88, 92, 94], "combinedtimesteplabelembed": 78, "combinedtimesteptextprojembed": 78, "come": [6, 9, 20, 70, 71, 74, 76, 84, 87], "comm": 65, "comma": [77, 82], "command": [8, 9, 12, 13, 14, 17, 18, 26, 27, 28, 50, 51, 52, 60, 64, 67, 68, 70, 75, 80, 83, 84, 87, 88, 89], "commandr": 88, "comment": 88, "commmod": 0, "common": [0, 5, 8, 10, 18, 36, 46, 64, 77, 84, 93], "common_prefix": 46, "commonli": [7, 24, 26, 88], "commstat": 0, "commtyp": 0, "commun": [0, 2, 6, 14, 25, 54, 64, 66, 72, 77, 86, 88], "communicationmod": [0, 2], "communicationtyp": 0, "compani": 48, "compar": [1, 2, 15, 20, 21, 23, 72, 74, 75, 76, 77, 92], "comparison": [6, 20, 24, 68], "compat": [10, 17, 26, 60, 75, 78, 83, 86, 88, 90], "compbin": 9, "compil": [6, 16, 59, 62, 66, 67, 68, 77, 87], "complet": [0, 1, 2, 3, 6, 8, 10, 29, 30, 32, 55, 56, 58, 60, 64, 65, 66, 68, 69, 70, 74, 75, 83, 88, 93, 94], "completion_token": 83, "completionoutput": [32, 48, 65], "complex": [7, 10, 14, 24], "compli": 26, "complic": 90, "compon": [2, 3, 5, 14, 16, 23, 24, 59, 85, 91], "compos": [0, 6, 68], "comprehens": [18, 26, 66], "compress": 19, "compris": 23, "comput": [0, 1, 4, 5, 6, 8, 10, 14, 19, 20, 21, 23, 24, 25, 38, 41, 42, 44, 45, 47, 67, 68, 71, 72, 76, 77, 84, 87, 88, 90, 91, 92, 93], "compute_relative_bia": 78, "computecontextlogit": 1, "computegenerationlogit": 1, "computenumpackedmask": 
1, "concat": [12, 24, 77], "concat_kvcach": 24, "concaten": [5, 9, 15, 24, 77, 90], "conced": 46, "concept": [14, 68, 73, 88, 93], "conceptu": 1, "concern": [14, 84], "conclus": 73, "concret": 90, "concur": 46, "concurr": [1, 2, 10, 18, 20, 24, 68, 88], "cond_proj_dim": 78, "conda": 88, "condit": [0, 1, 3, 6, 7, 10, 68, 77, 78, 88], "condition": 77, "conditioning_embed": 78, "conditioning_embedding_dim": 78, "conduct": [5, 68], "confess": 46, "config": [0, 1, 5, 8, 9, 11, 12, 15, 17, 18, 19, 26, 33, 46, 65, 68, 74, 78, 79, 80, 82, 87, 88, 90, 93], "config_class": 79, "config_dir": 79, "config_fil": [26, 65, 79], "configdict": 65, "configur": [0, 1, 2, 4, 5, 10, 15, 16, 18, 21, 25, 26, 39, 40, 44, 48, 49, 53, 60, 62, 65, 68, 69, 70, 73, 74, 76, 79, 82, 84, 87, 88, 92], "configuration_llama": 90, "configuration_mymodel": 90, "configuration_util": 90, "confirm": [38, 41, 42, 44, 45], "conform": 65, "conjunct": 74, "connect": [0, 14, 70, 71, 73], "connectionmanag": 0, "consecut": 6, "consequ": [2, 23, 71, 75], "conserv": [0, 76], "consid": [0, 1, 9, 10, 18, 23, 53, 54, 65, 69, 74, 77, 90, 94], "consider": [17, 23, 32], "consist": [7, 17, 20, 24, 66, 68, 70, 77, 85, 87, 92], "consol": 27, "consolid": 10, "const": [0, 1, 3], "const_iter": 1, "constant": [1, 5, 77, 84], "constant_to_tensor_": 77, "constantli": [38, 41, 42, 44, 45], "constants_to_tensors_": 77, "constantthreshold": 1, "constexpr": [0, 1], "constpointercast": 1, "constrain": [6, 23], "constraint": [0, 5, 6, 23, 62, 77], "construct": [0, 1, 3, 10, 14, 68, 77, 88, 92], "constructor": [0, 11, 53, 64, 83, 92], "consult": [10, 60, 67], "consum": [0, 7, 65, 77], "consumpt": [5, 20, 25], "contact": 77, "contain": [0, 1, 2, 3, 5, 6, 7, 9, 13, 14, 15, 16, 17, 24, 25, 26, 28, 50, 51, 52, 61, 62, 65, 66, 68, 69, 77, 79, 82, 83, 85, 86, 88, 89, 91, 92], "container_imag": [50, 51, 52], "container_img": 26, "content": [1, 9, 17, 26, 27, 29, 30, 31, 40, 55, 56, 59, 77, 83, 84, 88], "context": [0, 2, 4, 8, 23, 25, 59, 63, 
65, 68, 73, 77, 82, 84, 87, 88, 92, 93, 94], "context_chunking_polici": [65, 76], "context_fmha": [9, 25], "context_fmha_fp32_acc": 88, "context_fmha_typ": [5, 84], "context_init": 94, "context_len": [82, 92], "context_length": [77, 78, 82, 87], "context_logit": [65, 82], "context_mem_s": 82, "context_onli": 65, "context_parallel_s": 65, "context_phas": 5, "context_pre_onli": 78, "context_request": 94, "contextchunkingpolici": [0, 65, 76, 88], "contextexecutor": 2, "contextfmha": 1, "contextidx": 0, "contextlogit": 0, "contextmanag": 64, "contextparallel": 1, "contextphaseparam": [0, 2, 65], "contextpositionid": 1, "contextprefillposit": 0, "contextrequest": 1, "contextrequestid": 2, "contextrespons": 2, "contigu": [2, 71, 77, 88], "continu": [1, 3, 5, 10, 21, 23, 25, 65, 66, 72, 74, 82, 94], "contract": 68, "contrast": [6, 10, 92], "contrib": 19, "contribut": [17, 68, 77, 88], "contributor": [24, 84], "control": [0, 2, 5, 6, 7, 32, 36, 37, 65, 67, 68, 70, 76, 77, 78, 82, 85, 88], "conv": 77, "conv1d": [25, 77, 78], "conv2d": [77, 78], "conv3d": [77, 78], "conv_bia": 77, "conv_kernel": 82, "conv_stat": 79, "conv_state_or_ptr": 77, "conv_transpose2d": 77, "conv_weight": 77, "conveni": [1, 12, 17, 60], "convent": [17, 77], "convers": [1, 15, 22, 23, 53, 59, 83, 88], "convert": [0, 1, 9, 11, 12, 13, 14, 15, 17, 66, 68, 70, 72, 83, 87, 88, 92], "convert_and_load_weights_into_trtllm_llama": 17, "convert_checkpoint": [9, 11, 12, 13, 14, 17, 70, 71, 83, 87, 88], "convert_coneckpoint": 4, "convert_hf_mpt_legaci": 88, "convert_util": 88, "convert_weights_from_custom_training_checkpoint": 17, "convkernel": 1, "convolut": [0, 82], "convtranspose2d": 78, "coordin": [10, 59, 77], "copi": [0, 1, 2, 8, 10, 25, 28, 65, 72, 77, 84, 88, 92], "copy_on_partial_reus": 65, "copyfrom": 1, "copyonpartialreus": 0, "copytask": 1, "copytaskmappag": 1, "copyto": 0, "copytocpu": 0, "copytogpu": 0, "copytomanag": 0, "copytopag": 1, "copytopin": 0, "copytopooledpin": 0, "core": [6, 7, 9, 11, 14, 
17, 19, 20, 22, 60, 64, 68, 71, 83, 87, 88, 91], "coroutin": [42, 43, 65], "correct": [2, 3, 5, 9, 10, 88], "correctli": [8, 77, 88, 90], "correspond": [0, 1, 2, 4, 5, 7, 9, 10, 15, 17, 26, 65, 67, 75, 77, 78, 82, 85, 87, 88, 90], "cost": [8, 14, 24, 68, 71, 84, 88], "costli": 24, "could": [0, 2, 7, 8, 13, 41, 42, 43, 44, 45, 54, 65, 70, 84, 87, 88], "couldn": 74, "count": [0, 1, 6, 26, 34, 35, 64, 68, 79, 83], "count_include_pad": [77, 78], "countlocallay": 1, "countlowerranklay": 1, "cours": 10, "court": [38, 41, 42, 44, 45], "cover": [18, 72, 73, 75], "cp312": 60, "cp_config": 65, "cp_group": [77, 78], "cp_rank": [77, 78], "cp_size": [77, 78, 81, 88], "cp_split_plugin": 77, "cpp": [2, 3, 5, 6, 14, 18, 26, 51, 59, 60, 67, 68, 69, 70, 87, 88], "cpp_e2e": 82, "cpp_extens": 62, "cpp_llm_onli": 82, "cpp_onli": 60, "cpu": [0, 1, 8, 9, 11, 14, 24, 25, 26, 47, 62, 65, 77, 84, 87, 88, 92], "cpumemusag": [0, 65], "crash": 88, "creat": [1, 2, 3, 7, 8, 10, 11, 12, 14, 16, 17, 24, 26, 27, 32, 38, 41, 42, 43, 44, 45, 46, 47, 54, 55, 56, 57, 64, 65, 66, 68, 69, 70, 74, 75, 77, 78, 79, 82, 83, 84, 88, 90, 91, 92, 94], "create_allreduce_plugin": 77, "create_attention_const_param": 78, "create_builder_config": 11, "create_cuda_graph_metadata": 92, "create_execution_context": 82, "create_fake_weight": 77, "create_network": 14, "create_pytorch_model_based_executor": [93, 94], "create_runtime_default": 79, "create_sinusoidal_posit": 77, "create_sinusoidal_positions_for_attention_plugin": 77, "create_sinusoidal_positions_for_cogvlm_attention_plugin": 77, "create_sinusoidal_positions_long_rop": 77, "create_sinusoidal_positions_yarn": 77, "createloramodul": 1, "creation": [1, 65, 77, 84], "creativ": 6, "criteria": 82, "critic": [24, 68, 87], "crop": 78, "cropped_pos_emb": 78, "cross": [0, 9, 24, 65, 77, 82, 88], "cross_attent": [78, 82], "cross_attention_dim": 78, "cross_attention_mask": [78, 82], "cross_attention_mask_for_context": 82, "cross_attention_mask_for_gen": 82, 
"cross_attention_norm": 78, "cross_attention_norm_num_group": 78, "cross_attention_packed_mask": 78, "cross_attn_dens": [9, 25], "cross_attn_k": [9, 25], "cross_attn_q": [9, 25], "cross_attn_qkv": [9, 25], "cross_attn_v": [9, 25], "cross_kv": 77, "cross_kv_cache_block_offset": [78, 82], "cross_kv_cache_fract": [65, 82], "cross_kv_cache_gen": [78, 79], "cross_kv_length": 77, "cross_kv_reus": [78, 79], "crossattentionmask": 0, "crosskvcachefract": [0, 88], "crosskvcachestat": 0, "crucial": [10, 14, 23, 91], "ctor": 77, "ctx": 0, "ctx_request_id": 65, "ctxenginepath": 0, "ctxexecutorconfig": 0, "cu": [14, 24], "cu12": 88, "cu128": [61, 62], "cuassert": 87, "cublaslt": [25, 75], "cuda": [0, 1, 2, 5, 14, 18, 47, 54, 60, 61, 62, 65, 67, 68, 79, 82, 84, 87, 88, 92, 93], "cuda_arch": 60, "cuda_architectur": [18, 60], "cuda_graph_batch_s": [18, 69], "cuda_graph_cache_s": 65, "cuda_graph_inst": 87, "cuda_graph_mod": [65, 82, 87], "cuda_graph_padding_en": [18, 51, 69], "cuda_hom": 62, "cuda_launch_block": 87, "cuda_stream": 87, "cuda_stream_guard": 82, "cuda_stream_sync": 77, "cudadevicegetstreampriorityrang": 1, "cudaevent_t": 1, "cudaeventdisabletim": 1, "cudagraph": 88, "cudagraphcaches": 0, "cudagraphlaunch": 87, "cudagraphmod": 0, "cudamalloc": [1, 2], "cudamallocasync": [1, 2], "cudamemcpyasync": 47, "cudamempool": 1, "cudamempoolptr": 1, "cudaprofilerapi": 67, "cudart": 87, "cudastream": 0, "cudastream_t": 1, "cudastreamcreatewithflag": 1, "cudastreamnonblock": 1, "cudastreamptr": [0, 1], "cudeviceptr": 1, "cudnn": 88, "cumemgenericallocationhandl": 1, "cumlogprob": [0, 1], "cumlogprobscba": 1, "cumsum": [77, 88], "cumsumgenerationlength": 1, "cumsumlastdim": 77, "cumsumlength": 1, "cumul": [0, 1, 65, 77], "cumulative_logprob": [32, 48, 65], "curand": 88, "curl": [26, 58, 83], "currenc": 68, "current": [0, 1, 2, 3, 5, 9, 10, 18, 23, 24, 25, 32, 40, 53, 60, 68, 72, 74, 75, 76, 77, 82, 84, 86, 88, 89, 91, 92, 93, 94], "current_stream": 87, "currentexpandindic": 1, 
"curv": 22, "custom": [6, 14, 17, 19, 24, 25, 36, 37, 39, 47, 48, 49, 60, 66, 72, 75, 77, 82, 88, 91, 92], "custom_all_reduc": 88, "custom_mask": 77, "customallreduc": 88, "customized_key_dict": 15, "customized_preprocess": 15, "customizedmodulea": 15, "customizedmoduleb": 15, "cutlass": 88, "cxx11": 60, "cyclic": [59, 77, 82], "d": [1, 9, 26, 27, 29, 30, 31, 50, 51, 52, 53, 68, 77, 78, 83, 87, 88], "d0": 24, "d04e592bb4f6aa9cfee91e2e20afa771667e1d4b": 68, "dangl": 7, "data": [0, 1, 2, 5, 6, 14, 15, 19, 20, 21, 22, 23, 24, 25, 46, 56, 65, 68, 69, 70, 77, 79, 86, 87, 88, 90], "data_path": 51, "data_typ": [11, 13], "datacontext": 0, "dataset": [24, 30, 51, 54, 56, 65, 67, 72, 88], "dataset_fil": 69, "dataset_path": 68, "datatyp": [0, 1, 6, 14, 77, 82, 85, 87], "datatypetrait": 1, "date": 17, "datetim": 65, "dbrx": [85, 86, 88], "dbrxconfig": 79, "dbrxforcausallm": 79, "dconv": 77, "de": 1, "deactiv": 32, "dead": 88, "deal": [5, 7, 87], "dealloc": [1, 94], "death": [38, 41, 42, 44, 45], "debug": [0, 25, 26, 59, 60, 82, 84, 88], "debug_buff": 87, "debug_mod": [82, 87], "debug_tensors_to_sav": 82, "debugconfig": 0, "debuginputtensor": 0, "debugoutputtensor": 0, "debugtensor": 0, "debugtensornam": 0, "debugtensorsmaxiter": 0, "debugtensorsperiter": 0, "dec": [25, 82, 88], "decai": [0, 6, 65], "decid": [5, 13, 59, 68, 73, 74, 85, 91, 94], "decilmforcausallm": 86, "decis": [53, 77], "declar": [1, 6, 7, 17, 91, 93], "decltyp": [0, 1], "decod": [0, 1, 2, 5, 6, 12, 17, 24, 26, 36, 37, 59, 65, 68, 77, 82, 86, 88, 90, 93], "decode_batch": 82, "decode_duration_m": 65, "decode_regular": 82, "decode_retention_prior": 65, "decode_stream": 82, "decode_words_list": 82, "decode_wrapp": 92, "decodedurationm": 0, "decoder_batch": 1, "decoder_input_id": [79, 82], "decoder_language_adapter_rout": 82, "decoder_lay": 90, "decoder_start_token_id": 25, "decoderbuff": 1, "decoderenginebuff": 0, "decoderetentionprior": 0, "decoderjsonconfigstr": 0, "decoderlay": 90, "decoderlayerlist": 12, 
"decoderlookaheadbuff": 1, "decodermaskedmultiheadattent": 5, "decodermodel": [0, 79, 90], "decodermodelforcausallm": [12, 17, 79, 90], "decodermodelpath": 0, "decoderst": 88, "decoderxqarunn": 5, "decoding_typ": [18, 65], "decodingbaseconfig": 65, "decodingconfig": [0, 1], "decodinginputptr": 1, "decodingit": 0, "decodinglayerworkspac": 1, "decodingmod": [0, 1, 88], "decodingoutputptr": 1, "decompos": 5, "decor": 90, "decoupl": [24, 84], "decreas": [19, 20, 72], "dedic": [24, 87], "deduc": [25, 26, 88], "deep": [14, 20, 21, 67, 77, 88], "deepgemm": 18, "deeplearn": [77, 87], "deepseek": [26, 58, 67, 69, 86, 88], "deepseek_v1": 88, "deepseek_v2": 88, "deepseek_v3": [24, 88], "deepseekforcausallm": 79, "deepseekv1config": 79, "deepseekv2": 77, "deepseekv2attent": 78, "deepseekv2config": 79, "deepseekv2forcausallm": 79, "deepseekv3forcausallm": 86, "deepspe": 13, "def": [7, 12, 14, 15, 17, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 53, 54, 61, 62, 70, 72, 75, 76, 83, 87, 89, 90, 94], "default": [0, 1, 2, 3, 4, 5, 6, 8, 13, 15, 17, 25, 26, 27, 32, 49, 54, 59, 60, 65, 67, 69, 72, 73, 74, 75, 76, 77, 79, 82, 83, 84, 85, 87, 88, 90, 92], "default_net": 77, "default_plugin_config": 79, "default_trtnet": 14, "defaultvalu": 1, "defer": 77, "defin": [0, 1, 3, 5, 7, 10, 13, 14, 15, 16, 17, 18, 21, 25, 66, 68, 75, 77, 78, 85, 88, 90, 92], "definit": [3, 5, 16, 17, 24, 59, 66, 77, 87], "deftruth": 88, "degrad": [0, 2, 25, 72], "degre": [38, 41, 42, 44, 45, 47, 69, 72, 75], "delai": [69, 88], "deleg": [77, 92], "delet": [0, 1, 80, 87], "deliv": [18, 19, 22, 24, 69], "delta": [0, 24, 77, 78], "delta_bia": 77, "delta_softplu": 77, "demand": 24, "demo": [24, 30, 56], "demonstr": [3, 15, 20, 24, 64, 70, 72, 74, 75], "denmark": 46, "denois": 78, "denot": 10, "dens": [4, 5, 9, 13, 15, 77], "dense_4h_to_h": 15, "dense_bia": 78, "dense_h_to_4h": 15, "densiti": 23, "dep": 60, "departur": 46, "depend": [0, 2, 3, 5, 6, 7, 10, 13, 21, 26, 62, 65, 69, 70, 72, 75, 77, 84, 87, 88, 
93], "deploi": [10, 13, 26, 59, 62, 66], "deplot": [86, 88], "deploy": [23, 24, 66, 68, 72, 83, 88], "deprec": [25, 66, 68, 88], "deprecationwarn": 68, "depriv": 7, "depth": 10, "dequ": [0, 1], "dequant": [5, 59, 77], "deriv": [14, 15, 77, 84, 91], "descendli": 6, "describ": [5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 22, 28, 30, 56, 60, 64, 68, 69, 75, 77, 85, 87, 92], "descript": [0, 1, 6, 9, 26, 49, 59, 68, 69, 75, 77, 92], "deseri": 17, "deserializeadditionalmodeloutput": 0, "deserializeadditionaloutput": 0, "deserializebool": 0, "deserializecachest": 0, "deserializecachetransceiverconfig": 0, "deserializecommst": 0, "deserializecontextphaseparam": 0, "deserializedatatransceiverst": 0, "deserializedebugconfig": 0, "deserializedecodingconfig": 0, "deserializedecodingmod": 0, "deserializedisservingrequeststat": 0, "deserializedynamicbatchconfig": 0, "deserializeeagleconfig": 0, "deserializeexecutorconfig": 0, "deserializeextendedruntimeperfknobconfig": 0, "deserializeexternaldrafttokensconfig": 0, "deserializeguideddecodingconfig": 0, "deserializeguideddecodingparam": 0, "deserializeinflightbatchingstat": 0, "deserializeiterationstat": 0, "deserializeiterationstatsvec": 0, "deserializekvcacheconfig": 0, "deserializekvcacheretentionconfig": 0, "deserializekvcachestat": 0, "deserializelookaheaddecodingconfig": 0, "deserializeloraconfig": 0, "deserializemodeltyp": 0, "deserializemropeconfig": 0, "deserializeorchestratorconfig": 0, "deserializeoutputconfig": 0, "deserializeparallelconfig": 0, "deserializepeftcacheconfig": 0, "deserializeprompttuningconfig": 0, "deserializerequest": 0, "deserializerequestperfmetr": 0, "deserializerequeststag": 0, "deserializerequeststat": 0, "deserializerequeststatsperiter": 0, "deserializerequeststatsperiterationvec": 0, "deserializerespons": 0, "deserializeresult": 0, "deserializesamplingconfig": 0, "deserializeschedulerconfig": 0, "deserializesocketst": 0, "deserializespecdecfastlogitsinfo": 0, "deserializespeculativedecodingconfig": 0, 
"deserializestaticbatchingstat": 0, "deserializestr": 0, "deserializetensor": 0, "deserializetimepoint": 0, "deserializetokenrangeretentionconfig": 0, "design": [1, 10, 14, 15, 17, 18, 23, 24, 64, 70, 83, 91, 92, 93], "desir": [3, 69, 77, 83, 92], "destin": [50, 51, 52], "destroi": [1, 84], "destroyipcmemori": 1, "destructor": 1, "detail": [0, 3, 5, 10, 12, 14, 18, 24, 25, 26, 32, 36, 40, 54, 59, 68, 69, 70, 72, 76, 77, 79, 84, 87, 88, 91, 92], "detect": [0, 3, 26, 77, 88], "detect_format": 15, "determin": [0, 1, 5, 6, 9, 17, 65, 71, 72, 76, 77, 79, 85, 91, 93, 94], "determinenumpag": 1, "determinist": [75, 88], "detoken": [65, 88, 91], "detokenizedgenerationresultbas": 65, "dev": [61, 62, 88], "devel": [27, 28, 60], "develop": [12, 13, 14, 17, 24, 27, 38, 41, 42, 44, 45, 59, 60, 64, 66, 70, 77, 86, 88, 90], "deviat": 69, "devic": [0, 1, 2, 47, 65, 72, 77, 79, 81, 82, 87], "device_id": 82, "device_map": 81, "device_memory_size_v2": 84, "device_request_typ": 79, "deviceallocationnvl": 1, "devicecach": 1, "devicecacheperc": 0, "deviceid": [0, 1, 2], "dgx": [6, 14, 18], "diagon": 77, "diagram": 10, "diamond": 24, "dict": [12, 15, 17, 65, 77, 79, 82, 88, 90, 93], "dict_kei": 87, "dictat": 74, "dictionari": [13, 15, 65, 78], "didn": 74, "differ": [0, 1, 2, 4, 5, 6, 8, 12, 13, 14, 15, 17, 18, 23, 25, 30, 56, 60, 64, 65, 66, 68, 70, 72, 74, 75, 77, 79, 82, 84, 85, 88, 92], "differenti": 77, "difftyp": 1, "diffus": [30, 56, 78, 88], "diffusersattent": 78, "digit": 66, "dilat": [77, 78], "dim": [0, 1, 77, 78, 79, 82, 87], "dim0": 77, "dim1": 77, "dim_head": 78, "dim_in": 78, "dim_out": 78, "dim_rang": 77, "dimems": 1, "dimens": [0, 1, 5, 6, 9, 77, 78, 79, 84, 87, 88, 90], "dimension": 77, "dimrang": 77, "dimtype64": [0, 1], "dir": [32, 60, 64], "direct": [0, 2, 17, 62, 87], "directli": [0, 2, 6, 7, 10, 14, 17, 28, 32, 60, 64, 68, 75, 76, 77, 83, 88, 92, 94], "directori": [3, 12, 13, 14, 15, 17, 25, 50, 51, 52, 60, 65, 68, 69, 70, 79, 82, 83, 88, 90], "disabl": [0, 1, 5, 6, 
8, 11, 15, 25, 68, 72, 75, 76, 77, 80, 82, 84, 88], "disable_forward_chunk": 79, "disable_kv_cach": 82, "disable_weight_only_quant_plugin": 79, "disable_xqa": 5, "disablelookahead": 1, "disablelookaheaddecod": 1, "disableseamlesslookaheaddecod": 1, "disadvantag": [17, 71], "disagg_executor": 0, "disaggexecutororchestr": [0, 2], "disaggreg": [0, 59, 65, 88], "disaggregated_param": 65, "disaggregatedparam": 65, "disaggserverbenchmark": [2, 88], "disaggserverutil": 2, "discard": 72, "disclaim": [70, 72, 74, 75], "disclosur": 88, "disconnect": 88, "discourag": [0, 6, 65], "discov": [14, 62], "discrep": [60, 90], "discuss": [5, 70, 72, 75, 76, 88], "disk": [3, 17, 41, 44, 60, 64], "dispatch": [0, 4, 17, 24, 32], "displai": 65, "disservingrequeststat": 0, "disservingstat": 0, "dist": [18, 51, 62, 67, 68, 69, 70], "distanc": [5, 77], "distil": 88, "distinct": [9, 10, 24, 77], "distinguish": 8, "distribut": [1, 4, 5, 6, 14, 24, 36, 37, 68, 77, 82, 84], "distserv": 2, "disturb": 46, "dit": [79, 88], "div": 77, "dive": [66, 67], "divers": [0, 6, 67], "diversity_penalti": 6, "divid": [15, 77, 88], "divup": 77, "dl": 23, "do": [1, 2, 7, 15, 17, 18, 23, 24, 32, 59, 62, 70, 72, 75, 77, 83, 87, 90, 92], "do_cross_attent": [77, 78], "do_layer_norm_befor": 13, "do_sampl": 6, "doc": [1, 18, 22, 24, 28, 72, 75, 77, 87, 88], "docker": [18, 50, 51, 52, 59, 83, 87, 88], "docker_run_arg": 18, "dockerfil": [27, 60], "document": [0, 2, 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 20, 21, 23, 29, 30, 31, 32, 33, 34, 35, 40, 55, 56, 57, 60, 62, 63, 67, 69, 70, 76, 77, 84, 85, 87, 91, 92], "doe": [0, 2, 5, 9, 10, 17, 18, 19, 25, 68, 69, 75, 77, 82, 84, 86, 88, 90, 94], "doesn": [1, 5, 24, 27, 32, 68, 74, 75], "dollar": 68, "domin": [24, 88], "don": [10, 17, 27, 71, 75, 77], "done": [1, 8, 14, 18, 66, 68, 72, 74, 77, 80, 90], "dongjiyingdji": 88, "dora": [25, 77, 78], "dora_plugin": [9, 25, 77], "dot": [15, 24, 77], "doubl": [0, 20, 73, 75, 87], "down": [0, 2, 3, 9, 19, 53, 66, 71, 77, 82], 
"down_proj": 15, "download": [16, 50, 51, 52, 53, 60, 61, 62, 64, 68, 70, 83, 87, 88], "downscale_freq_shift": 78, "downsid": 75, "downstream": 85, "dp": [18, 19, 22, 24, 88], "dp8": 24, "dprank": 0, "dpsize": 0, "dq": 59, "draft": [0, 1, 24, 25, 59, 82, 88], "draft_indic": 79, "draft_len": 79, "draft_path": 82, "draft_prob": 79, "draft_target_model": 10, "draft_token": [65, 79], "draft_tokens_extern": [25, 79], "draftacceptancethreshold": 1, "draftbuff": 1, "drafter": 10, "draftindic": 1, "draftlen": 1, "draftlogit": 1, "draftparticipantid": 0, "draftpath": 1, "draftpathshost": 1, "draftprob": 1, "draftrequestid": 0, "drafttoken": [0, 1], "drafttokenid": 1, "drafttokensextern": 1, "dram": 14, "dreamgenx": 88, "drive": [14, 68], "driven": 66, "driver": [84, 88], "drop": [72, 74, 76], "dropout": 78, "dropout_prob": 78, "dry_run": [25, 65, 88], "dst": 1, "dstate": 77, "dsttype": 1, "dt_proj": 77, "dt_rank": 77, "dtype": [1, 7, 9, 11, 12, 13, 14, 17, 65, 68, 70, 71, 77, 78, 79, 80, 81, 82, 87, 88, 93], "dual": 60, "due": [0, 10, 17, 21, 24, 60, 68, 70, 74, 76, 82, 88, 92], "dummi": [65, 70, 88], "dump": [0, 3, 60, 65], "dump_debug_buff": 82, "duplic": 88, "duplicate_data": 77, "durat": [0, 70], "duration_m": 65, "durationm": 0, "dure": [0, 1, 5, 6, 7, 10, 11, 14, 22, 24, 25, 60, 65, 67, 68, 75, 76, 82, 84, 87, 92, 93], "dynam": [0, 24, 25, 65, 68, 77, 79, 82, 84, 88, 94], "dynamic_batch_config": 65, "dynamic_batch_moving_average_window": 65, "dynamic_quant_bf16tonvfp4": 24, "dynamic_tree_max_topk": [39, 65], "dynamicbatchconfig": [0, 65], "dynamicbatchmovingaveragewindow": 0, "dynamicbatchsizeconfig": 0, "dynamicdecodelay": 1, "dynamicqu": 24, "dynamictreemaxtopk": 0, "dynamictreemaxtopkhost": 1, "e": [0, 2, 3, 5, 8, 9, 15, 26, 27, 47, 50, 51, 52, 60, 65, 67, 77, 80, 82, 85, 87, 88, 90], "e2": 59, "e4m3": 20, "e5m2": 20, "each": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 14, 18, 24, 25, 26, 32, 47, 50, 51, 52, 65, 68, 69, 70, 71, 74, 75, 76, 77, 78, 80, 82, 84, 85, 87, 
88, 91, 92, 93, 94], "eager": [66, 88], "eagl": [0, 1, 25, 36, 37, 59, 65, 79, 82, 88], "eagle_choic": [39, 65, 82], "eagle_dynamic_tree_max_top_k": 82, "eagle_posterior_threshold": 82, "eagle_temperatur": 79, "eagle_use_dynamic_tre": 82, "eaglechoic": [0, 1], "eagleconfig": [0, 1, 79], "eagledecodingconfig": [39, 65], "eagleforcausallm": 79, "eagleinput": 1, "eaglelastinput": 1, "eaglenetctxcontextlengthshost": 1, "eaglenetctxpastkeyvaluelengthshost": 1, "eaglenetctxrequesttypeshost": 1, "eaglenetgencontextlengthshost": 1, "eaglenetgenpastkeyvaluelengthshost": 1, "eaglenetgenrequesttypeshost": 1, "earli": [82, 87, 88], "earlier": [0, 13, 72, 87], "early_stop": [6, 65, 82, 88], "early_stop_criteria": 82, "earlystop": [0, 1, 6], "eas": [16, 66, 69], "easi": [23, 70], "easier": [14, 17, 18, 68], "easili": [15, 16, 18, 24, 66, 77], "east": [12, 14, 87], "eastern": 83, "ebnf": [0, 3, 65], "echo": [26, 27, 28, 51, 52], "eddi": 88, "edg": 20, "edit": [10, 60], "ef648e7489c040679d87ed12db5d3214": 83, "effect": [0, 2, 6, 10, 24, 25, 62, 65, 72, 74, 75], "effici": [4, 5, 6, 8, 10, 14, 16, 24, 25, 30, 38, 41, 42, 44, 45, 56, 84, 86, 89, 91, 92, 93], "effort": [10, 13, 54, 72, 88], "eg": 69, "eight": [18, 19], "einop": 77, "einstein": 77, "einsum": 77, "einsum_eq": 77, "either": [0, 1, 2, 3, 16, 24, 41, 44, 54, 65, 77, 84, 87, 88], "element": [0, 1, 5, 6, 9, 65, 77, 78, 85], "element_typ": 1, "elementwis": [7, 77], "elementwise_affin": 78, "elementwise_binari": 77, "elementwise_sub": 7, "elementwise_sum": 7, "elementwiseoper": [7, 77], "eleutherai": 68, "elif": 94, "elimin": [2, 10, 24, 25, 66, 68, 72, 74, 88], "ellipsi": 77, "els": [0, 14, 15, 17, 32, 47, 49, 54, 77, 87, 94], "elsinor": 46, "emb": [14, 56, 78], "embark": 66, "embed": [0, 8, 12, 25, 65, 68, 77, 82, 88, 90, 92], "embed_dim": 78, "embed_posit": 78, "embed_positions_for_gpt_attent": 78, "embed_positions_for_gpt_attention_loc": 78, "embed_positions_loc": 78, "embed_token": [15, 90], "embedding_bia": 65, 
"embedding_dim": 78, "embedding_multipli": 79, "embedding_parallel_mod": 65, "embedding_scal": 79, "embedding_sharding_dim": [13, 79], "embeddingbia": [0, 1], "embeddingt": [0, 1], "emerg": [23, 24], "emphasi": 13, "emploi": [10, 91, 94], "empow": 24, "empti": [0, 1, 10, 32, 77, 88, 94], "emptybuff": 1, "emptygenslot": 0, "emptytensor": 1, "emul": [77, 88], "en": 88, "enabl": [0, 2, 3, 5, 6, 7, 9, 10, 11, 14, 15, 20, 21, 22, 23, 24, 25, 26, 28, 32, 38, 43, 45, 59, 60, 61, 62, 65, 68, 70, 74, 76, 77, 78, 79, 80, 82, 83, 85, 87, 88, 90, 92, 93], "enable_allreduc": 77, "enable_attention_dp": [18, 26, 51, 65], "enable_batch_size_tun": 65, "enable_block_reus": [26, 39, 46, 49, 65], "enable_build_cach": [65, 88], "enable_chunked_context": [82, 88], "enable_chunked_prefil": [65, 88], "enable_context_fmha_fp32_acc": [65, 82], "enable_debug_output": [25, 65, 87], "enable_forward_chunk": 79, "enable_fp8": 54, "enable_if_t": 1, "enable_iter_perf_stat": 26, "enable_kv_cache_reus": 8, "enable_lora": [53, 65], "enable_max_num_tokens_tun": [65, 88], "enable_multi_devic": 88, "enable_nvfp4": 54, "enable_overlap_schedul": [26, 69], "enable_partial_reus": 65, "enable_prompt_adapt": [65, 88], "enable_qkv": 78, "enable_tqdm": 65, "enable_trt_overlap": 88, "enable_ucx": 88, "enable_xqa": 88, "enableattentiondp": [0, 1], "enablebatchsizetun": 0, "enableblockreus": [0, 8], "enablechunkedcontext": 0, "enablecontextfmhafp32acc": 0, "enabled_with_fp32_acc": 5, "enablelookaheaddecod": 1, "enablemaxnumtokenstun": 0, "enablepartialreus": 0, "enableseamlesslookaheaddecod": [0, 1], "enabletrtoverlap": 0, "enc": [25, 82, 88], "enc_dec": 6, "encapsul": [5, 6, 14, 77], "encdecmodelrunn": 82, "encod": [0, 5, 6, 20, 24, 25, 65, 77, 82, 85, 86, 88], "encode_base64_content_from_url": 56, "encoded_vocab": [0, 3], "encodedvocab": [0, 3], "encoder_hidden_st": [78, 79], "encoder_input_featur": 82, "encoder_input_id": 82, "encoder_input_len_rang": 88, "encoder_input_length": [77, 78, 82], 
"encoder_language_adapter_rout": 82, "encoder_max_input_length": [78, 82], "encoder_output": [78, 79, 82], "encoder_output_length": 82, "encoder_run": 82, "encoderenginebuff": 0, "encoderhiddens": 1, "encoderinputfeatur": 0, "encoderinputtokenid": 0, "encoderjsonconfigstr": 0, "encoderlen": 0, "encodermodel": [0, 79], "encodermodelpath": 0, "encoderoutput": 0, "encoderoutputlength": 0, "encount": [15, 18, 62, 87], "encourag": [0, 6, 17, 65], "end": [0, 1, 5, 6, 14, 25, 39, 44, 48, 49, 54, 65, 66, 68, 72, 75, 76, 77, 83, 88, 93], "end_dim": 77, "end_id": [65, 82, 88], "end_token": [0, 65], "endeavor": 24, "endid": [0, 1], "endpoint": [34, 35, 65, 83, 88], "endswith": 15, "enforc": [70, 77], "engin": [0, 1, 2, 3, 5, 6, 7, 9, 10, 11, 16, 17, 22, 24, 25, 26, 32, 41, 44, 53, 59, 62, 65, 69, 71, 72, 74, 75, 76, 77, 79, 82, 84, 87, 88], "engine_buff": 82, "engine_dir": [11, 12, 13, 14, 17, 65, 68, 70, 82, 83, 87], "engine_inspector": 82, "engine_llama_3": 14, "engine_nam": 82, "engine_output": 25, "engineaddr": 1, "enginebuff": [0, 1], "enginefilenam": 1, "engineinput": 1, "engineoutput": 1, "enginepath": 1, "engines": 1, "enhanc": [4, 6, 10, 18, 24, 66, 76, 84, 89, 92], "enjoi": [28, 38, 41, 42, 44, 45, 47], "enough": [5, 8, 18, 74, 84, 91, 94], "enqueu": [0, 3, 14, 82, 84, 88], "enqueuecontext": 0, "enqueuegener": 0, "enqueuerequest": [0, 2, 3], "ensur": [2, 3, 4, 7, 17, 60, 68, 74, 80, 90, 93], "enter": [7, 27, 69, 74, 93], "enterpris": 40, "entir": [0, 3, 9, 14, 19, 24, 66, 68, 69, 77, 84, 93], "entri": [0, 9, 36, 45, 61, 62, 68, 77, 83, 88], "entrypoint": [27, 64, 70], "enum": [0, 1, 2], "enumer": [0, 1, 43, 47, 89], "env": [26, 29, 30, 31, 33, 34, 35, 68], "envelop": 48, "environ": [6, 10, 18, 24, 30, 50, 51, 52, 56, 59, 60, 62, 67, 68, 70, 72, 74, 75, 87, 88, 89, 92], "environment": 15, "eo": [6, 65], "eof": [18, 26, 51], "eos_token_id": [3, 82], "ep": [4, 18, 24, 26, 68, 77, 78], "ep2": 24, "ep2tp4": 24, "ep4tp2": 24, "ep8tp8": 24, "ep_siz": [26, 33], "epsilon": 
[0, 77], "eq": 77, "equal": [0, 1, 3, 4, 25, 32, 71, 77, 78, 84], "equal_progress": [65, 76], "equat": [22, 77], "equip": [2, 16], "equival": [24, 72, 77, 90], "equvili": 25, "erenup": 88, "err": [50, 51, 52], "error": [0, 2, 3, 9, 17, 25, 26, 54, 59, 60, 62, 65, 70, 74, 84, 88], "errorcod": 64, "errormsg": 0, "especi": [7, 25, 38, 41, 42, 44, 45, 47, 71, 74, 93], "essenti": [10, 68], "estim": [54, 68, 88, 94], "et": 19, "etc": [0, 1, 10, 62, 67, 72, 75, 82, 84, 87, 90], "ethnzhng": 88, "eval": 40, "evalu": [20, 21, 59, 88], "even": [2, 5, 6, 14, 17, 23, 24, 25, 46, 70, 74, 77, 84], "evenli": [4, 24], "event": [0, 1, 36, 37, 65], "event_buffer_max_s": [46, 65], "event_id": 46, "eventbuffermaxs": 0, "eventid": 0, "eventptr": 1, "ever": [0, 75], "everi": [0, 3, 15, 24, 68, 70, 71, 77, 82], "everyth": 14, "evict": [0, 1, 8, 9, 66, 68, 70, 74], "evolv": [5, 17, 24, 66, 85, 93], "ex": [51, 52], "exact": [5, 84], "exam": 24, "examin": 10, "exampl": [0, 5, 6, 7, 8, 10, 11, 12, 16, 17, 19, 21, 23, 26, 32, 40, 47, 50, 54, 59, 60, 64, 65, 69, 70, 71, 72, 73, 74, 75, 76, 77, 82, 83, 84, 85, 86, 87, 88, 89, 90, 92, 94], "example_logits_processor": 47, "exaon": [15, 86, 88], "exc": 43, "exce": [0, 2, 76, 77], "exceed": [0, 84], "except": [0, 3, 5, 6, 17, 24, 25, 49, 71, 77, 87, 88], "excess": 5, "exchang": 65, "excit": [38, 41, 42, 43, 44, 45], "exclud": [65, 72, 77, 88], "exclude_input_from_output": 65, "exclude_modul": [13, 65, 88], "excludeinputfromoutput": 0, "exclus": [1, 6, 85, 88], "exec": 67, "execut": [0, 2, 3, 6, 9, 10, 14, 16, 17, 24, 59, 65, 66, 67, 68, 74, 76, 77, 82, 83, 84, 91, 94], "executor": [1, 2, 8, 10, 11, 16, 32, 47, 53, 59, 65, 66, 68, 76, 82, 84, 88, 91], "executor_config": 93, "executorconfig": [0, 3, 11], "executorexampledisaggreg": 2, "executorexamplefastlogit": 88, "exhaust": [0, 16], "exist": [1, 6, 8, 9, 10, 15, 17, 24, 25, 46, 62, 65, 68, 82, 88, 92], "exit": [69, 82], "exp": 77, "expand": [0, 21, 23, 77, 82, 88], "expand_dim": 77, 
"expand_dims_lik": 77, "expand_mask": 77, "expand_shap": 77, "expans": 77, "expect": [0, 5, 6, 12, 14, 15, 17, 21, 25, 32, 50, 51, 52, 59, 65, 68, 70, 73, 77, 87, 88], "expens": [3, 10, 66, 71, 72, 76], "experi": [10, 22, 23, 24, 64, 66, 67, 68, 87], "experiment": [5, 6, 10, 15, 26, 50, 51, 52, 59, 68, 85, 88, 89], "expert": [9, 18, 26, 45, 59, 65, 75, 88], "expertis": 24, "expir": 0, "explain": [6, 14, 16, 74, 77, 84, 85, 91, 92], "explan": [18, 75, 82, 84], "explicit": [0, 1, 10, 77, 88], "explicit_draft_token": [10, 25, 79], "explicitdrafttoken": [0, 1], "explicitdrafttokensinput": 1, "explicitdrafttokenslastinput": 1, "explicitdrafttokensmodul": 1, "expliciteosstop": 0, "explicitli": [1, 2, 7, 10, 14, 15, 25, 26, 32, 65, 88], "explor": [10, 24, 66], "expon": 20, "exponenti": 10, "export": [2, 13, 17, 18, 24, 25, 26, 34, 35, 50, 51, 52, 68, 81, 82, 87, 88], "export_fmt": 89, "expos": [0, 6, 14, 28, 60, 72, 88], "express": [0, 3, 65, 77], "extend": [0, 3, 8, 14, 24, 65, 75, 77, 88], "extended_runtime_perf_knob_config": [65, 88], "extendedruntimeperfknobconfig": [0, 65], "extens": [13, 16, 62, 66, 68, 88], "extern": [0, 7, 15, 82, 84], "external_checkpoint_dir": 15, "external_kei": 15, "external_weight": 15, "externaldrafttoken": 0, "externaldrafttokensconfig": [0, 1], "externaldrafttokensinput": 1, "externalstream": 47, "extra": [0, 2, 5, 8, 10, 13, 18, 24, 25, 26, 33, 62, 69, 71, 72, 82, 88], "extra_arg": 51, "extra_id": 8, "extra_llm_api_opt": [18, 26, 33, 51, 68, 69], "extra_token": 78, "extract": [0, 3, 60, 67, 73, 77, 82], "extrapol": 77, "extrem": [14, 24, 72, 74, 75], "f": [0, 5, 6, 27, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 53, 54, 61, 62, 65, 67, 70, 76, 77, 83, 87, 89], "face": [3, 9, 11, 16, 17, 32, 65, 68, 79, 83, 88], "facilit": [7, 10, 83], "fact": [66, 68, 75], "factor": [23, 71, 72, 77, 78, 84, 85], "factori": [17, 65, 82, 88], "factual": 6, "fail": [65, 82, 84, 87, 94], "failur": [15, 88], "fairli": 14, "fairseq": [86, 88], "fake": 
[8, 88], "fakebuff": 1, "falcon": [13, 23, 64, 68, 85, 86, 88], "falconconfig": 79, "falconforcausallm": 79, "falconmodel": 79, "fall": [62, 69, 88], "fallback": 15, "fals": [0, 1, 2, 3, 5, 6, 7, 8, 13, 24, 25, 26, 40, 46, 49, 51, 65, 77, 78, 79, 80, 81, 82, 88], "false_output_valu": 77, "false_valu": 77, "famili": [5, 15, 86, 88], "familiar": [6, 14, 64, 70, 71, 73, 83], "famou": [6, 54], "faq": 59, "far": [0, 3], "fast": [0, 5, 10, 65, 68, 71, 88], "fast_build": [25, 65, 88], "fastapi": 88, "fastapi_serv": 88, "faster": [5, 17, 20, 21, 25, 69, 70, 77], "fasterdecod": 49, "fastlogit": 0, "fault": 88, "favor": 88, "favorit": 53, "fc": [13, 14, 15, 87], "fc_gate": 78, "fc_gate_dora": 78, "fc_gate_lora": 78, "fc_gate_plugin": 78, "featur": [0, 2, 3, 5, 7, 9, 10, 13, 14, 15, 17, 23, 24, 25, 50, 51, 52, 59, 60, 68, 72, 74, 75, 76, 77, 80, 82, 86, 89, 90, 92], "feature_dim": 82, "fed": [69, 79], "feed": 77, "feedback": 88, "feedforward": 4, "feel": 53, "fetch": [0, 26, 91], "few": [8, 14, 17, 23, 74], "fewer": [5, 10, 19, 92], "ffn": [4, 24], "ffn_hidden_s": 78, "fhma": 88, "field": [0, 6, 13, 17, 26, 28, 32, 65, 66, 68, 72, 79, 80, 85, 88, 92], "figur": 24, "file": [0, 3, 4, 5, 7, 8, 13, 14, 15, 17, 18, 25, 26, 34, 35, 62, 65, 67, 68, 69, 82, 83, 88, 90], "filepath": 1, "filesystem": [0, 1], "fill": [1, 15, 28, 38, 41, 42, 44, 45, 77, 92], "fill_attention_const_params_for_long_rop": 78, "fill_attention_const_params_for_rop": 78, "fill_attention_param": 78, "fill_none_tensor_list": 78, "fill_valu": [47, 77], "fillemptyfieldsfromruntimedefault": 0, "filloper": 77, "filltaskstensor": 1, "filter_medusa_logit": 82, "final": [0, 1, 9, 24, 25, 26, 27, 32, 77, 94], "final_logit_softcap": 79, "final_output_id": 82, "finalize_decod": 82, "find": [18, 72, 77, 87, 88], "find_best_medusa_path": 82, "fine": [10, 18, 68, 75, 78], "finer": 7, "finetun": 24, "finish": [0, 1, 3, 6, 17, 32, 48, 64, 65, 66, 68, 82, 91, 93], "finish_reason": [48, 65, 83, 88], "finishedst": 1, 
"finishedsum": 1, "finishreason": [0, 1, 88], "first": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 16, 21, 23, 25, 26, 27, 62, 64, 65, 68, 69, 70, 72, 74, 75, 76, 77, 84, 87, 88, 90, 92, 93, 94], "first_come_first_serv": [65, 76], "first_gen_token": 65, "first_lay": 82, "firstgentoken": 0, "firstit": 0, "firstli": [27, 74, 84], "firstscheduledtim": 0, "firsttokentim": 0, "fit": [1, 5, 19, 20, 65, 71, 72, 94], "fitting_request": 94, "fix": [9, 10, 68, 84], "fjosw": 88, "flag": [0, 1, 3, 5, 9, 17, 22, 26, 32, 59, 68, 72, 73, 74, 76, 77, 84, 88], "flags_siz": 1, "flan": [85, 86], "flash": [5, 14], "flashattent": [5, 14, 83], "flashinf": 92, "flashinferattent": 92, "flashmla": 88, "flatten": [1, 9, 22, 77, 78], "flattenedinouts": 1, "flattenn": 1, "flayer": 7, "flayerinfomemo": 7, "flexibl": [10, 17, 24, 32, 60], "flight": [1, 16, 59, 68, 74, 76, 83, 84, 88], "flip": 77, "flip_sin_to_co": 78, "float": [0, 1, 6, 11, 13, 14, 20, 47, 65, 76, 77, 78, 79, 82, 85], "float16": [7, 9, 11, 12, 13, 17, 25, 71, 77, 79, 80, 83, 87], "float2": 77, "float32": [0, 13, 25, 77, 78, 79, 80], "floattensor": 90, "floattyp": [0, 1], "floor_div": 77, "floordiv": 77, "flow": [7, 17, 24, 70, 71, 72, 74, 75, 88, 91, 94], "fly": [5, 77, 85], "fmha": [0, 25, 65, 77, 82, 84, 88], "fmt_dim": 1, "focu": [7, 23, 24, 67], "focus": [10, 68, 72, 73, 88], "fold": 84, "folder": [0, 3, 6, 17, 70, 85, 86, 88], "folder_trt_llm": 14, "follow": [3, 6, 7, 9, 10, 12, 13, 14, 15, 17, 18, 23, 24, 25, 26, 28, 32, 42, 43, 46, 50, 51, 52, 60, 61, 62, 64, 68, 69, 70, 71, 72, 73, 74, 75, 77, 83, 85, 86, 88, 89, 90, 92, 93], "footprint": [5, 19, 84], "for_each_rank": 79, "forc": [5, 24, 68], "force_drop_id": 78, "force_multi_block_mod": 68, "force_nccl_all_reduce_strategi": 88, "force_num_profil": 65, "force_words_id": 6, "forecast": 10, "foretel": 46, "fork": 67, "form": [0, 3, 5, 10, 65, 77, 83], "format": [0, 3, 13, 15, 17, 20, 23, 35, 59, 60, 64, 65, 66, 70, 72, 82, 83, 84, 87, 88, 92], "former": [14, 23, 46], "formula": 77, 
"forum": 88, "forward": [0, 1, 7, 10, 12, 14, 47, 76, 77, 78, 79, 87, 88, 90, 91, 92, 93, 94], "forward_loop": 68, "forward_with_cfg": 79, "forward_without_cfg": 79, "forwardasync": 1, "forwarddispatch": 1, "forwardsync": 1, "found": [3, 4, 5, 6, 7, 10, 14, 16, 20, 60, 62, 68, 70, 72, 75, 85, 94], "four": [3, 7, 10, 13, 24, 78], "fourth": 3, "fp": [85, 88], "fp16": [5, 9, 11, 13, 15, 19, 20, 23, 25, 59, 68, 72, 75, 77, 83, 86, 87, 88], "fp32": [0, 5, 24, 25, 59, 65, 77, 82, 83, 86, 87, 88], "fp4": [18, 25, 88], "fp8": [17, 18, 19, 21, 22, 23, 24, 25, 41, 49, 54, 59, 65, 68, 73, 75, 77, 80, 84, 86, 88, 89, 92], "fp8_block_scal": 65, "fp8_blockscale_gemm": 88, "fp8_inputs_overrid": 77, "fp8_kv_cach": [5, 85], "fp8_per_channel_per_token": 65, "fp8_qdq": 85, "fp8_rowwise_gemm_plugin": 25, "fp_valu": 5, "fpa_intb": 88, "fraction": [0, 26, 65, 77, 78, 82], "framework": [10, 12, 13, 16, 17, 66, 77, 88], "franc": [12, 14, 36, 38, 39, 41, 42, 43, 44, 45, 49, 54, 61, 62, 70, 76, 83, 87, 89], "free": [0, 1, 9, 14, 15, 26, 66, 74, 78, 79, 82, 84, 93], "free_gpu_memory_fract": [26, 32, 44, 48, 65, 76, 88], "free_resourc": [91, 93], "freed": 68, "freedom": 17, "freegpumemoryfract": [0, 84, 88], "freenumblock": 0, "french": 83, "freq": 77, "frequenc": [68, 78], "frequency_penalti": [65, 82, 88], "frequencypenalti": [0, 1, 6], "frequent": [8, 87], "friend": [0, 1, 68], "friendli": 77, "from": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 21, 23, 24, 25, 26, 27, 28, 32, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 61, 62, 64, 65, 66, 68, 69, 70, 71, 72, 74, 75, 76, 77, 78, 79, 81, 82, 83, 84, 87, 88, 89, 90, 91, 92, 93, 94], "from_argu": 79, "from_checkpoint": [17, 79], "from_config": 79, "from_dict": [65, 79], "from_dir": 82, "from_engin": 82, "from_hugging_fac": [12, 15, 17, 79], "from_jax": 17, "from_json_fil": [65, 79], "from_kera": 17, "from_meta_ckpt": [17, 79], "from_nemo": [17, 79], "from_pretrain": 79, 
"from_prun": 79, "from_serialized_engin": 82, "from_str": 77, "fromfil": 14, "full": [0, 4, 5, 6, 8, 9, 10, 20, 21, 26, 66, 67, 68, 71, 77, 82, 83, 84, 87], "full_lik": 47, "fulli": [36, 88], "function": [0, 1, 3, 5, 11, 12, 14, 16, 17, 24, 64, 65, 66, 67, 75, 80, 82, 84, 85, 86, 87, 88, 93, 94], "further": [3, 4, 5, 10, 14, 19, 23, 25, 68, 72, 75, 92], "furthermor": [10, 24, 72], "fuse": [5, 10, 14, 24, 25, 75, 77, 83, 88, 90, 92], "fuse_a": 24, "fuse_fp4_qu": 25, "fuse_qkv_project": 79, "fuseattentionwithbiaspass": 7, "fused_gate_up_dora": 78, "fused_gate_up_lora": 78, "fusedgatedmlp": [77, 78], "fusevalu": 1, "fusion": [7, 25, 59, 66, 74, 84, 85, 88, 92], "fusion_op": 77, "futur": [2, 5, 6, 10, 15, 17, 23, 25, 36, 38, 39, 40, 41, 42, 43, 44, 45, 49, 54, 60, 61, 62, 64, 65, 66, 68, 70, 76, 77, 83, 84, 85, 88, 89], "fuyu": [86, 88], "g": [3, 15, 26, 47, 50, 51, 52, 65, 74, 82, 90], "g1": 74, "g2": 74, "gain": [71, 74], "gamma": 77, "gate": [9, 15, 25, 70, 77, 88], "gate_a": 77, "gate_a_bia": 77, "gate_bia": 77, "gate_proj": 15, "gate_x": 77, "gate_x_bia": 77, "gatedmlp": [77, 78], "gather": [0, 1, 25, 42, 43, 65, 77, 82], "gather_all_token_logit": [25, 88], "gather_context_logit": [25, 65, 79, 82], "gather_dim": [14, 77], "gather_generation_logit": [25, 65, 79, 82], "gather_last_token_logit": 77, "gather_nd": 77, "gather_output": 78, "gathercontext": [0, 88], "gatheredid": 1, "gatherel": 77, "gathergenerationlogit": 0, "gathermod": 77, "gathertre": 1, "gatherv2": 77, "gb": [2, 21, 60, 65, 68], "gb200": 88, "gcc": 60, "gd": 0, "gdrdma": 2, "geforc": 88, "gegelu": 77, "gegelu_limit": 78, "geglu": 77, "gelu": [77, 79], "gelu_pytorch_tanh": 88, "gelu_tanh": 78, "gemm": [7, 25, 74, 77, 83, 84, 88], "gemm_allreduc": 77, "gemm_allreduce_plugin": [25, 82], "gemm_fc1": 24, "gemm_plugin": [9, 11, 13, 14, 25, 68, 72, 75, 78, 83], "gemm_swiglu": 77, "gemm_swiglu_plugin": [25, 72, 80], "gemma": [17, 64, 85, 86, 88], "gemma2": 86, "gemma2_added_field": 79, "gemma2_config": 79, 
"gemma3": 88, "gemma3_added_field": 79, "gemma3_config": 79, "gemma_added_field": 79, "gemma_config_kwarg": 79, "gemmaconfig": 79, "gemmaforcausallm": 79, "gen": [65, 88], "genai": [23, 26, 58], "genattent": 24, "genenginepath": 0, "gener": [0, 1, 3, 6, 8, 10, 13, 14, 15, 17, 18, 19, 20, 22, 24, 25, 36, 37, 38, 46, 59, 61, 62, 64, 65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 76, 77, 79, 82, 83, 84, 86, 87, 88, 89, 90, 91, 92, 93, 94], "generate_alibi_bias": 77, "generate_alibi_slop": 77, "generate_async": [32, 42, 43, 65, 88], "generate_logn_sc": 77, "generate_tllm_weight": 15, "generated_text": [36, 39, 49, 53, 61, 62, 70, 76, 83, 89], "generatedtokensperenginestep": 1, "generation_complet": 94, "generation_in_progress": 94, "generation_logit": [48, 65, 82], "generation_onli": 65, "generation_phas": 5, "generation_request": 94, "generation_to_complet": 94, "generationexecutor": [2, 88], "generationlength": 1, "generationlengthsdevic": 1, "generationlengthshost": 1, "generationlengthshostcopi": 1, "generationlogit": 0, "generationmixin": 79, "generationrequestid": 2, "generationresult": 65, "generationsequ": 82, "generationsess": [5, 82, 84], "generationstep": 1, "genericprompttuningparam": 1, "genert": 2, "genexecutorconfig": 0, "genidx": 0, "genrequest": 1, "genrespons": 2, "get": [0, 1, 2, 3, 5, 7, 9, 11, 15, 22, 26, 27, 28, 32, 36, 37, 60, 61, 62, 65, 66, 67, 70, 72, 77, 79, 82, 83, 87, 88, 89, 94], "get_1d_sincos_pos_embed_from_grid": 78, "get_2d_sincos_pos_emb": 78, "get_2d_sincos_pos_embed_from_grid": 78, "get_audio_featur": 82, "get_batch_cache_indic": 93, "get_batch_idx": 82, "get_block_offset": 82, "get_buff": 93, "get_comm": 65, "get_config_group": 79, "get_context_phase_param": 65, "get_device_cap": 54, "get_first_past_key_valu": 78, "get_hf_config": 79, "get_input": 7, "get_kv_cache_ev": [46, 65], "get_kv_cache_events_async": 65, "get_max_resource_count": [93, 94], "get_needed_resource_to_complet": [93, 94], "get_next_medusa_token": 82, 
"get_num_free_block": 93, "get_num_heads_kv": 82, "get_output": [7, 14], "get_par": [7, 77], "get_request_typ": 65, "get_rope_index": 82, "get_seq_idx": 82, "get_shap": 15, "get_slic": 15, "get_stat": [65, 88], "get_stats_async": 65, "get_timestep_embed": 78, "get_us": [7, 77], "get_visual_featur": 82, "get_vocab": [0, 3], "get_weight": 78, "getacceptancethreshold": 0, "getacceptedlengthscumsum": 1, "getacceptedpackedpath": 1, "getadditionalmodeloutput": 0, "getadditionaloutputnam": 0, "getaddress": 1, "getallnewtoken": 1, "getallottedtimem": 0, "getattentionconfig": 0, "getbackend": 0, "getbadword": 0, "getbatchingtyp": 0, "getbatchsizet": 0, "getbeamsearchbuff": 1, "getbeamsearchdiversityr": 0, "getbeamwidth": 0, "getbeamwidtharrai": 0, "getbuffermanag": 1, "getcachest": 0, "getcachetransceiverconfig": 0, "getcapac": 1, "getcapacityschedulerpolici": 0, "getclientid": 0, "getcommptr": 1, "getcommst": 0, "getcommunicationmod": 0, "getcommunicationtyp": 0, "getconfig": 0, "getconnect": 0, "getcontextchunkingpolici": 0, "getcontextexecutor": 0, "getcontextfmha": 1, "getcontextparallel": 1, "getcontextparallelgroup": 1, "getcontextparallelrank": 1, "getcontextphaseparam": 0, "getcopyonpartialreus": 0, "getcpu": 1, "getcpudiff": 1, "getcrossattentionmask": 0, "getcrosskvcachefract": 0, "getcudagraphcaches": 0, "getcudagraphmod": 0, "getcumlogprob": 1, "getdata": 0, "getdatatyp": [0, 1], "getdatatypenam": 1, "getdebugconfig": 0, "getdebuginputtensor": 0, "getdebugoutputtensor": 0, "getdebugtensornam": 0, "getdebugtensorsmaxiter": 0, "getdecodedurationm": 0, "getdecoderetentionprior": 0, "getdecoderst": 1, "getdecoderstream": 1, "getdecodingconfig": 0, "getdecodingmod": 0, "getdefaultbatchslot": 1, "getdefaulteaglechoic": 1, "getdevic": 1, "getdevicecacheperc": 0, "getdeviceid": 0, "getdeviceof": 1, "getdimens": 1, "getdrafttoken": 0, "getdynamicbatchconfig": 0, "getdynamicbatchmovingaveragewindow": 0, "getdynamictreemaxtopk": 0, "geteaglechoic": 0, "geteagleconfig": 0, 
"getearlystop": 0, "getembeddingbia": 0, "getembeddingt": 0, "getenablebatchsizetun": 0, "getenableblockreus": 0, "getenablechunkedcontext": 0, "getenablecontextfmhafp32acc": 0, "getenablemaxnumtokenstun": 0, "getenablepartialreus": 0, "getenabletrtoverlap": 0, "getencodedvocab": 0, "getencoderhiddens": 1, "getencoderinputfeatur": 0, "getencoderinputtokenid": 0, "getencoderoutputlength": 0, "getendid": 0, "geterrormsg": 0, "geteventbuffermaxs": 0, "getexecutionconfig": 1, "getextendedruntimeperfknobconfig": 0, "getexternaldrafttokensconfig": 0, "getfastlogit": 0, "getfinishedstep": 1, "getfinishedsum": 1, "getfinishreason": 1, "getfirstgentoken": 0, "getfreegpumemoryfract": 0, "getfrequencypenalti": 0, "getgatheredid": 1, "getgathergenerationlogit": 0, "getgemmallreducedtyp": 1, "getgenexecutor": 0, "getgpu": 1, "getgpudiff": 1, "getgpuspergroup": 1, "getgpuspernod": 1, "getgpuweightsperc": [0, 11], "getguid": 0, "getguideddecodingconfig": 0, "getguideddecodingparam": 0, "getguidetyp": 0, "gethiddens": 1, "gethostcaches": 0, "gethostmemori": 1, "getid": 1, "getinittozero": 1, "getinputtokenextraid": 0, "getinputtokenid": 0, "getinst": 1, "getipcunicastpoint": 1, "getisorchestr": 0, "getiterstatsmaxiter": 0, "getjointdecodinginput": 1, "getjointdecodingoutput": 1, "getkvcacheconfig": 0, "getkvcacheconfigref": 0, "getkvcacheeventmanag": 0, "getkvcacheretentionconfig": 0, "getkvcachetyp": 1, "getkvdatatyp": 1, "getlanguageadapteruid": 0, "getlastrank": 1, "getlatestdebugtensor": 0, "getlatestev": 0, "getlatestiterationstat": [0, 3], "getlatestrequeststat": 0, "getlayertyp": 1, "getlengthpenalti": 0, "getlevel": 1, "getlocalrank": 1, "getlogit": 0, "getlogitsdtyp": 1, "getlogitspostprocessor": 0, "getlogitspostprocessorconfig": 0, "getlogitspostprocessornam": 0, "getlogprob": 1, "getlookaheadconfig": 0, "getlookaheaddecodingconfig": 0, "getlookaheaddecodingmaxnumrequest": 0, "getloraconfig": 0, "getloramodul": 1, "getloraprefetchdir": 0, "getmanagedweightsmapopt": 1, 
"getmanageweightstyp": 1, "getmaxadapters": 0, "getmaxattentionwindowvec": 0, "getmaxbatchs": [0, 1], "getmaxbeamwidth": [0, 1], "getmaxdecodingdecodertoken": 1, "getmaxdecodingdrafttoken": 1, "getmaxdecodingenginetoken": 1, "getmaxdecodingtoken": 1, "getmaxdraftpathlen": 1, "getmaxencoderlen": 1, "getmaxinputlen": 1, "getmaxlorarank": 1, "getmaxnonleafnodesperlay": 1, "getmaxnumpath": 1, "getmaxnumtoken": [0, 1], "getmaxpagesperblock": 1, "getmaxpagesperblockdevic": 0, "getmaxpagesperblockhost": 0, "getmaxpathlen": 1, "getmaxpositionembed": 1, "getmaxpromptembeddingtables": 1, "getmaxqueues": 0, "getmaxseqidlemicrosecond": 0, "getmaxsequencelen": 1, "getmaxsequencelength": 1, "getmaxtoken": 0, "getmedusachoic": [0, 1], "getmemorytyp": [0, 1], "getmemorytypenam": 1, "getminp": 0, "getmintoken": 0, "getmlphiddens": 1, "getmodelconfig": [0, 1], "getmodelconfigmut": 1, "getmodelnam": 1, "getmodelvari": 1, "getmpist": 0, "getmropeconfig": 0, "getmropepositiondelta": 0, "getmroperotarycossin": 0, "getmultiblockmod": 0, "getmulticastpoint": 1, "getmultimodalembed": 0, "getnam": [0, 1], "getnbattentionlay": 1, "getnbhead": 1, "getnbkvhead": 1, "getnblay": 1, "getnbrnnlay": 1, "getnextdrafttoken": 1, "getnextdrafttokenslength": 1, "getngrams": 0, "getnoderank": 1, "getnoderankof": 1, "getnorepeatngrams": 0, "getnormalizelogprob": 0, "getnumcopystream": [0, 1], "getnumdecodingenginetoken": 1, "getnumdevicemodulelay": 0, "getnumensurework": 0, "getnumhostmodulelay": 0, "getnumkvheadsperlay": 1, "getnumkvheadsperlayerlocalrang": 1, "getnumlanguag": 1, "getnumnod": 0, "getnumpackedmask": 1, "getnumpag": 1, "getnumputwork": 0, "getnumresponsesreadi": 0, "getnumreturnbeam": [0, 1], "getnumreturnsequ": 0, "getnumtransformerlay": 1, "getonboardblock": 0, "getoptimaladapters": 0, "getoptprofilessplitpoint": 1, "getorchestratorconfig": 0, "getorchleadercomm": 0, "getoutputconfig": 0, "getpadid": 0, "getpagedcontextfmha": 1, "getpageptr": 1, "getpagewidth": 1, "getparallelconfig": 0, 
"getparentid": 1, "getparticipantid": 0, "getpath": 1, "getpathopt": 1, "getpeftcacheconfig": 0, "getperblockretentionprioritydur": 0, "getpin": 1, "getpinneddiff": 1, "getpinnedpool": 1, "getpinnedpooldiff": 1, "getpipelineparallel": 1, "getpipelineparallelgroup": 1, "getpipelineparallelrank": 1, "getpositionid": 0, "getposteriorthreshold": 0, "getppreducescatt": 1, "getprecis": 1, "getpresencepenalti": 0, "getprevdrafttokenslength": 1, "getprior": 0, "getprocessorbatch": 0, "getprocessormap": 0, "getprompttableoffload": 0, "getprompttuningconfig": 0, "getquantmod": 1, "getrank": 1, "getrecvpollperiodm": 0, "getrepetitionpenalti": 0, "getrepl": 0, "getreqid": 0, "getrequestid": 0, "getrequeststatsmaxiter": 0, "getrequesttyp": 0, "getresult": [0, 2, 3], "getreturnallgeneratedtoken": 0, "getrnnconfig": 1, "getrotaryembeddingdim": 1, "getruntimedefault": 1, "getruntimetyp": 0, "getsamplingconfig": [0, 1], "getschedulerconfig": 0, "getschedulerconfigref": 0, "getse": 0, "getsecondaryoffloadminprior": 0, "getselfidx": 0, "getsequencelength": 1, "getserializedst": 0, "getshap": [0, 1], "getsinktokenlength": 0, "getsiz": [0, 1], "getsizeinbit": 1, "getsizeinbyt": [0, 1], "getsizeperhead": 1, "getskipcrossattnblock": 0, "getslotsperpag": 1, "getsocketst": 0, "getspawnprocess": 0, "getspecdecconfig": 0, "getspeculativedecodingmod": 1, "getspeculativedecodingmodul": 1, "getspeculativedecodingmoduleptr": 1, "getstat": 0, "getstatu": 1, "getstoptokenid": 0, "getstopword": 0, "getstream": [0, 1], "getsumlocalkvhead": 1, "gettag": 0, "gettaskid": 0, "gettemperatur": 0, "gettensorparallel": 1, "gettensorparallelgroup": 1, "gettensorparallelrank": 1, "getter": 6, "gettoken": 0, "gettokenizerstr": 0, "gettokenrangeretentionconfig": 0, "gettokensperblock": 1, "gettopk": 0, "gettopp": 0, "gettoppdecai": 0, "gettoppmin": 0, "gettoppresetid": 0, "gettotalnumpag": 1, "gettyp": 1, "getunderlyingdecod": 1, "getunicastpoint": 1, "getusegpudirectstorag": 0, "getuvm": 1, "getuvmdiff": 1, 
"getverificationsets": 0, "getvers": 1, "getvocabs": 1, "getvocabsizepad": 1, "getweight": 0, "getwindows": 0, "getworkerexecutablepath": 0, "getworlds": 1, "gh200": 88, "ghost": 46, "ghz": 40, "gib": [8, 84], "gid": 0, "gigabyt": 21, "git": [9, 18, 60, 64, 83, 87, 89], "github": [17, 18, 24, 60, 64, 66, 83, 88, 89], "give": [3, 66, 72, 74, 79], "given": [0, 1, 3, 6, 9, 15, 17, 21, 64, 65, 67, 73, 74, 77, 78, 79, 81, 82, 84, 85, 88, 93], "givyboi": 53, "glm": [64, 77, 86, 88], "glm4": [64, 88], "global": [0, 5, 14, 24, 88], "global_max_input_length": 82, "global_max_output_length": 82, "globalrequestid": 0, "glossari": [19, 22], "gm": 87, "gnu": 60, "go": [5, 6, 46, 71, 88], "goal": 76, "goe": [64, 68], "good": [3, 14, 18, 68, 71, 74, 75], "got": [0, 38, 40, 41, 42, 43, 44, 45, 46, 47, 53, 54, 64, 68, 87], "gpqa": 24, "gpt": [1, 5, 10, 14, 16, 20, 23, 25, 59, 64, 68, 77, 84, 85, 86, 87, 88], "gpt2": [79, 87], "gpt3": 21, "gpt_attent": [5, 7, 22, 77, 83, 88], "gpt_attention_plugin": [9, 14, 25, 68, 78, 82, 87, 88], "gpt_attention_plugin_remove_pad": 7, "gpt_variant": [79, 88], "gptattent": 7, "gptattentionpluginremovepaddingrewritepass": 7, "gptconfig": 79, "gptdecod": 6, "gptdecoderbatch": 88, "gptdecoderptr": 1, "gptforcausallm": 79, "gptj": 79, "gptjconfig": 79, "gptjforcausallm": 79, "gptjmodel": 79, "gptlmheadmodel": 87, "gptmanag": 88, "gptmanagerbenchmark": [8, 60, 88], "gptmodel": 79, "gptmodelconfig": 88, "gptneoxforcausallm": 79, "gptneoxmodel": 79, "gptq": [23, 59, 86, 88], "gptsession": 88, "gptsessionbenchmark": 88, "gpu": [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 13, 16, 17, 20, 21, 22, 23, 25, 26, 32, 50, 51, 52, 54, 59, 60, 62, 64, 65, 69, 70, 71, 72, 75, 77, 79, 82, 83, 86, 87, 88, 91, 92], "gpu_weights_perc": [11, 82], "gpumemusag": [0, 26], "gpus_per_nod": [25, 26, 65], "gpuspernod": [1, 6], "gpusync": 1, "gpuweightsperc": [0, 11], "gqa": [5, 19, 22, 25, 77, 88, 92], "grace": [8, 59, 86], "gradient": 20, "gradual": 17, "grain": 7, "gram": 10, "grammar": [0, 
3, 65], "granit": [86, 88], "graph": [0, 14, 18, 59, 65, 67, 68, 77, 82, 83, 84, 87, 88, 92, 93], "graph_rewrit": 7, "graphic": 48, "gre": 26, "great": [19, 48], "greater": [0, 2, 5, 22, 23, 24, 25, 77], "greatli": [8, 17, 72, 75], "greedi": [0, 6, 91], "greedy_sampl": [39, 65], "greedysampl": 0, "greedysamplinghost": 1, "grid": [14, 72, 74, 77, 78], "grid_search_engin": 70, "grid_siz": 78, "grok": [86, 88], "ground": 67, "groundbreak": 66, "group": [0, 3, 4, 6, 14, 19, 59, 65, 77, 78, 85, 88, 92], "group_cl": 79, "group_norm": 77, "group_siz": [13, 65, 77], "groupedrmsnorm": 24, "groupnorm": [77, 78], "grow": [1, 10, 74], "gt": 77, "gtc": [18, 24], "guarante": [0, 6, 8, 17, 68, 69, 70, 72, 76], "guaranteed_no_evict": [0, 65, 68, 76], "guaranteednoevictschedul": 94, "guard": [46, 70], "guid": [0, 14, 18, 23, 36, 37, 59, 64, 65, 66, 67, 69, 70, 71, 72, 75, 77, 87, 88, 92], "guidanc": [10, 26, 75, 78, 79], "guided_decod": [40, 65], "guided_decoding_backend": [40, 65], "guideddecodingbackend": 0, "guideddecodingconfig": [0, 3], "guideddecodingparam": [0, 3, 40, 65], "guidelin": [2, 71], "guidetyp": [0, 3], "gw": 7, "h": [2, 3, 5, 10, 15, 25, 26, 29, 30, 31, 70, 77, 79, 83, 88], "h1": 77, "h100": [17, 23, 25, 66, 69, 70, 72, 73, 74, 88], "h20": 25, "h200": [20, 25, 69, 88], "h2d": 47, "ha": [0, 1, 3, 5, 8, 9, 13, 14, 15, 17, 18, 19, 23, 24, 25, 28, 60, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 82, 84, 85, 87, 88, 91, 93, 94], "had": [17, 72, 74], "half": [0, 1, 14, 70, 77], "halv": [20, 77], "hand": [8, 10, 16, 71], "handl": [1, 2, 4, 15, 17, 19, 24, 70, 72, 74, 75, 76, 77, 78, 90, 91], "handle_per_step": 82, "hang": [0, 64, 87, 88], "happen": [3, 6, 8, 14, 62, 84, 87], "happi": 82, "hard": 5, "harder": 6, "hardwar": [23, 32, 59, 60, 88], "has_affin": 77, "has_bia": 77, "has_config_group": 79, "has_position_embed": 82, "has_scal": 77, "has_token_type_embed": 82, "has_zero_point": [13, 65], "hascontextawaitthread": 0, "hasdraftlogit": 1, "haserror": [0, 3], 
"hasgenawaitthread": 0, "hash": [0, 65], "hasresult": 0, "hasrnnconfig": 1, "hasspeculativedecodingmodul": 1, "hattizai": 88, "have": [0, 1, 3, 4, 5, 6, 8, 9, 10, 13, 14, 15, 17, 18, 19, 21, 23, 24, 25, 27, 46, 49, 50, 51, 52, 53, 62, 64, 65, 66, 67, 68, 70, 71, 72, 73, 74, 75, 76, 77, 82, 83, 84, 86, 87, 88, 90], "hbm3": 69, "hbm3e": 21, "he": 46, "head": [1, 6, 10, 14, 19, 25, 49, 54, 59, 68, 77, 78, 88, 92], "head_dim": [92, 93], "head_siz": [5, 77, 79, 82, 88], "header": 2, "headsiz": 77, "headsperlay": 1, "health": [26, 53], "heat": 6, "heavi": 75, "heavier": 71, "height": [35, 78, 82], "hello": [36, 38, 39, 41, 42, 43, 44, 45, 47, 49, 50, 53, 54, 61, 62, 70, 76, 83, 89], "help": [2, 3, 5, 7, 14, 24, 25, 26, 29, 30, 40, 47, 49, 55, 56, 60, 67, 68, 69, 70, 73, 74, 75, 76, 77, 83, 88, 91], "helper": [1, 77], "henc": 90, "here": [2, 3, 7, 9, 11, 12, 13, 14, 15, 17, 18, 20, 21, 26, 28, 32, 36, 40, 60, 67, 70, 71, 72, 74, 75, 77, 82, 83, 84, 85, 87, 89, 92, 93, 94], "heterogen": 2, "heurist": [5, 68, 77, 88], "hf": [6, 9, 11, 15, 25, 26, 41, 42, 43, 44, 45, 49, 50, 51, 52, 54, 68, 69, 70, 82, 86, 87, 89], "hf_config_or_dir": 79, "hf_lora_convert": 9, "hf_model": [68, 79], "hf_model_dir": [11, 12, 13, 17, 79], "hf_model_nam": 68, "hf_model_or_dir": 79, "hf_quant_config": 68, "hf_token": 68, "hfconfigordir": 79, "hgx": 21, "hi": 9, "hidden": [0, 3, 4, 5, 6, 9, 10, 24, 65, 77, 78, 88], "hidden_act": [13, 78, 79], "hidden_dim": [0, 5, 77], "hidden_dim_per_head": [5, 77], "hidden_dtyp": 78, "hidden_s": [0, 7, 13, 15, 77, 78, 79, 82, 90, 92], "hidden_size_in": 9, "hidden_size_out": 9, "hidden_size_per_head": 77, "hidden_st": [12, 77, 78, 79, 82, 87, 90], "hidden_states_for_emb": 79, "hiddens": [0, 1, 6], "hide": 24, "hierarch": 13, "hierarchi": [17, 77], "high": [3, 10, 12, 14, 17, 19, 23, 24, 64, 68, 76, 77, 84, 88], "higher": [0, 1, 5, 6, 8, 9, 10, 15, 19, 20, 22, 66, 69, 76, 84, 88, 90], "highest": [6, 7, 20, 21], "highli": [10, 14, 67, 72], "highlight": [20, 23, 72, 
74], "himself": 46, "hint": [68, 77], "hit": [0, 69, 74, 75, 88], "hk": 10, "ho": 9, "hoc": [17, 82], "hold": [0, 1, 3, 4, 7, 8, 9, 10, 65, 71, 78, 84, 91], "home": [18, 54, 68], "homo_head_pattern": 78, "homogen": 2, "hope": 24, "hopper": [5, 8, 18, 19, 20, 23, 25, 59, 60, 66, 72, 86, 88], "horatio": 46, "horizont": 25, "host": [1, 9, 26, 28, 33, 47, 52, 59, 60, 65, 75, 77, 88], "host_cache_s": 65, "host_context_length": [77, 78, 79, 82, 87], "host_context_progress": [77, 78, 87], "host_cross_kv_cache_block_offset": [78, 82], "host_cross_kv_cache_pool_map": 78, "host_cross_kv_cache_pool_point": 78, "host_kv_cache_block_offset": [77, 78, 82, 87], "host_kv_cache_block_point": 87, "host_kv_cache_pool_map": [77, 78, 87], "host_kv_cache_pool_point": [77, 78, 87], "host_max_attention_window_s": [77, 78, 87], "host_past_key_value_length": [77, 78, 87], "host_request_typ": [77, 78, 79, 87], "host_runtime_perf_knob": [77, 78, 87], "host_sink_token_length": [77, 78, 87], "hostcaches": [0, 8], "hostmemori": 1, "hostnam": 26, "hour": 70, "hous": 71, "how": [0, 2, 3, 10, 12, 14, 15, 17, 25, 28, 36, 50, 59, 64, 65, 67, 70, 72, 73, 75, 77, 83, 84, 85, 87, 89, 91, 92], "howev": [2, 3, 5, 10, 17, 18, 19, 24, 26, 68, 71, 72, 74, 75, 76, 84, 88, 90, 91], "hpc": 20, "html": [1, 77, 87], "http": [0, 1, 4, 9, 17, 18, 24, 25, 26, 29, 30, 31, 55, 56, 57, 60, 61, 62, 64, 77, 83, 85, 87, 88, 89], "hub": [16, 53, 65, 68, 83, 88, 89], "hug": [3, 9, 11, 16, 17, 32, 65, 68, 79, 83, 88], "huggingfac": [0, 9, 12, 13, 15, 17, 18, 30, 53, 56, 64, 68, 69, 70, 83, 86, 87, 88, 90], "huggingface_exampl": 89, "huggingface_hub": 53, "huggingface_model_card": 89, "human": [24, 68], "hurt": 75, "hw": 24, "hybrid": [4, 88], "hyper": 13, "hypothesi": 10, "i": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 25, 26, 28, 29, 31, 32, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 57, 60, 61, 62, 64, 65, 66, 68, 69, 70, 72, 73, 74, 75, 76, 77, 
78, 79, 80, 82, 83, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94], "ia3": 5, "iactivationlay": 14, "ibrahimamin1": 88, "ibufferptr": 1, "iconstantlay": 77, "icudaengin": [82, 84], "id": [0, 1, 3, 8, 32, 43, 65, 68, 69, 77, 78, 82, 83, 92, 93], "idea": [9, 75], "ideal": [7, 72, 74, 88], "ident": [3, 8, 25, 77], "identifi": [0, 6, 9, 10, 14, 68, 74, 77], "idl": 0, "idtyp": [0, 3], "idx": 82, "ieee": 85, "ieinsumlay": 77, "ielementwiselay": 77, "iexecutioncontext": [82, 84], "ifb": [10, 88], "ifilllay": 77, "igatherlay": 77, "ignor": [25, 65, 68, 77, 82], "ignore_eo": [65, 88], "igptdecod": 1, "ihostmemori": [1, 14, 82], "ii": [5, 77], "ij": 77, "ijk": 77, "ijl": 77, "ik": 77, "ikl": 77, "ilay": [7, 14], "illustr": [7, 10, 16, 24], "ilogg": 1, "ilooplay": 77, "imag": [26, 30, 35, 50, 51, 52, 56, 59, 61, 62, 68, 78, 82, 88], "image64": 56, "image_grid_thw": 82, "image_patches_indic": 82, "image_path": 82, "image_s": 79, "image_token_index": 82, "image_url": [30, 56], "imatrixmultiplylay": 77, "imbal": 74, "immedi": [5, 10, 66, 70, 87], "immut": 1, "impact": [10, 19, 23, 24, 26, 53, 71, 72, 74, 75, 76], "imped": 23, "impl": [0, 94], "implement": [2, 3, 5, 6, 10, 13, 14, 16, 17, 19, 47, 59, 66, 77, 78, 83, 85, 86, 87, 88, 90, 91, 93, 94], "implicit": [1, 5, 10, 77], "implicitli": 1, "import": [10, 15, 17, 19, 23, 26, 32, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 53, 54, 55, 56, 57, 59, 61, 62, 70, 72, 74, 75, 76, 83, 86, 88, 89, 90, 91, 93], "impos": 23, "improv": [5, 8, 14, 19, 20, 21, 22, 23, 24, 25, 38, 41, 42, 44, 45, 59, 66, 68, 69, 70, 72, 73, 74, 75, 88, 89, 92], "in_channel": 78, "in_featur": [13, 14, 78], "in_hidden_s": 77, "in_len": 7, "in_point": 77, "in_progress": 82, "includ": [0, 1, 2, 3, 5, 6, 8, 9, 10, 13, 14, 15, 16, 19, 20, 22, 25, 26, 32, 40, 46, 54, 60, 62, 64, 65, 66, 72, 75, 77, 83, 85, 87, 88, 91, 92, 93, 94], "include_stop_str_in_output": 65, "inclus": 77, "incompat": [25, 88, 89], "incorpor": [0, 24, 66, 88], "incorrect": [8, 10, 88], 
"increas": [0, 5, 8, 10, 14, 18, 20, 21, 24, 25, 67, 68, 70, 72, 75, 76, 77, 88, 94], "incred": 66, "increment": [60, 88], "incur": [14, 24], "inde": 84, "independ": [0, 1, 2, 3, 10, 77], "index": [0, 1, 3, 10, 15, 24, 32, 48, 59, 61, 62, 65, 77, 82, 83, 88, 92], "index_select": 77, "indic": [0, 1, 3, 5, 6, 10, 13, 65, 76, 77, 78, 82, 84, 93], "indim": 1, "indimfirst": 1, "indirect": 1, "individu": [24, 88], "indivis": 88, "industri": 68, "ineffici": [5, 24], "inetworkdefinit": [7, 14, 77], "inevit": 14, "inf": 47, "infeas": 3, "infer": [0, 2, 6, 9, 10, 14, 16, 17, 18, 19, 20, 21, 24, 25, 30, 56, 59, 64, 67, 69, 70, 71, 72, 73, 75, 76, 77, 82, 85, 87, 88, 91], "infer_shap": 82, "inferencerequest": 88, "infin": 28, "infinit": [14, 68, 69], "inflat": 24, "inflight": [0, 5, 9, 10, 26, 63, 65, 68, 73, 74, 77, 88, 92, 94], "inflight_request_id": 94, "inflightbatch": 0, "inflightbatchingstat": [0, 26], "influenc": [24, 75], "info": [0, 25, 26, 68, 84, 87], "inform": [0, 1, 2, 3, 5, 6, 10, 13, 14, 19, 22, 24, 26, 59, 66, 68, 70, 86, 87, 88], "infti": 6, "inherit": [15, 17, 77, 90, 91, 93, 94], "init": [1, 18, 60, 88], "init_audio_encod": 82, "init_image_encod": 82, "init_llm": 82, "init_processor": 82, "init_token": 82, "initi": [1, 2, 10, 15, 24, 47, 65, 68, 72, 74, 75, 84, 87, 88, 90, 92, 94], "initializer_list": [0, 1], "initmemorypool": 84, "inittozero": 1, "inlin": [0, 1], "inner": 77, "inner_layernorm": [78, 79], "inp": 77, "inpaint": [30, 56], "inprogress": 1, "input": [0, 1, 3, 6, 7, 8, 9, 10, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 32, 34, 35, 51, 56, 59, 64, 65, 67, 68, 69, 70, 71, 73, 75, 76, 77, 78, 79, 82, 84, 86, 87, 88, 90, 91, 92, 94], "input_1": 77, "input_1_": 77, "input_audio": 82, "input_featur": 79, "input_fil": 88, "input_id": [8, 12, 24, 68, 77, 79, 82, 87, 90], "input_imag": 82, "input_layernorm": [12, 13, 15, 90], "input_length": [77, 78, 79, 82], "input_list": 77, "input_n": 77, "input_n_": 77, "input_text": [12, 14, 82, 83], 
"input_timing_cach": [25, 65], "input_token_extra_id": 82, "inputbuff": 1, "inputdesc": 14, "inputdtyp": 1, "inputgentokenshost": 1, "inputlen": 1, "inputpack": [1, 6], "inputs_emb": 90, "inputtokenextraid": 0, "inputtokenid": 0, "insert": [7, 14, 68, 77], "insertinputtensor": 1, "insid": [1, 10, 15, 17, 18, 60, 62, 77, 84, 92], "insight": 24, "insiz": 1, "inspect": [25, 67, 84], "instabl": 2, "instal": [17, 26, 27, 50, 51, 52, 60, 64, 70, 83, 88, 90], "instanc": [0, 2, 3, 6, 7, 10, 14, 24, 32, 47, 64, 65, 82, 84, 88, 92], "instance_idx": 87, "instanti": [70, 76, 93], "instead": [7, 8, 10, 14, 17, 18, 19, 32, 60, 75, 76, 77, 84, 88], "instruct": [10, 18, 26, 30, 35, 41, 56, 60, 68, 69, 70, 71, 75, 76, 83, 86, 88, 89, 90], "int": [0, 1, 6, 12, 13, 14, 17, 43, 47, 65, 74, 77, 78, 79, 82, 90, 92, 93, 94], "int32": [1, 5, 25, 77, 80, 87], "int32_t": [0, 1, 77], "int4": [15, 17, 23, 25, 32, 54, 59, 86, 88], "int4_weight": 85, "int64": [1, 6, 77, 87], "int64_t": [0, 1], "int8": [1, 13, 15, 17, 23, 25, 59, 65, 72, 77, 84, 86, 88], "int8_kv_cach": [5, 85, 88], "int8_t": [0, 1], "int8_weight": 85, "int8awq": 72, "int_clip": 77, "integ": [5, 65, 68, 77, 85, 88], "integr": [10, 88, 91, 92, 93, 94], "intellig": 66, "intend": 84, "intent": 70, "intention": 17, "intenum": 77, "inter": [2, 70, 71, 72, 74, 75, 87, 88], "inter_layernorm": 79, "inter_s": 15, "interact": [3, 10, 66, 83, 87], "interchang": 64, "interconect": 71, "interconnect": [6, 70, 71, 72, 74, 75], "interest": 68, "interfac": [14, 17, 70, 82, 88, 90, 91], "interfer": 87, "interleav": [5, 14], "intermedi": [5, 14, 65, 87], "intermediate_s": [13, 79], "intern": [1, 3, 5, 17, 18, 24, 70, 73, 84, 87, 93], "internal_error": [25, 26], "internlm": [64, 85, 86, 88], "internlm2": [85, 86, 88], "internvl2": 88, "interpol": 77, "interpolation_scal": 78, "interpret": [3, 60, 74], "intersect": 2, "intertwin": 75, "intflag": [79, 81], "intpsplitdim": 1, "intra": 71, "introduc": [17, 20, 24, 28, 85, 88], "introduct": [73, 83, 
88], "intuit": [66, 73], "inv": 77, "inv_freq": 77, "invalid": [87, 88], "inventori": 68, "invers": 5, "invest": 68, "investig": [18, 88], "invit": 54, "invoc": 88, "invok": [0, 3, 7, 64, 87, 94], "invokequant": 14, "involv": [0, 1, 2, 10, 14, 23, 78, 91, 92, 93], "io": [5, 27, 28, 84, 88], "ip": [0, 88], "ipc": 60, "ipc_uc_handl": 1, "ipc_uc_ptr": 1, "ipc_uc_va": 1, "ipcmemori": 1, "ipcnvl": 1, "ipcnvlsalloc": 1, "ipcnvlsfre": 1, "ipcnvlshandl": 1, "ipcnvlssupport": 1, "ipluginv3lay": 77, "ireducelay": 77, "irrespect": [0, 6, 47, 65], "is_alibi": 77, "is_caus": 78, "is_const_v": 1, "is_cuda_graph": 92, "is_def": 77, "is_dora": 9, "is_dynam": 77, "is_enc_dec": 82, "is_expert": 78, "is_gated_activ": 77, "is_gemma_2": 79, "is_gemma_3": 79, "is_loc": 78, "is_medusa_mod": 82, "is_mla_en": 77, "is_mla_enabled_flag": 77, "is_module_excluded_from_quant": 65, "is_mrop": 77, "is_network_input": 77, "is_orchestrator_mod": 82, "is_qkv": 78, "is_redrafter_mod": 82, "is_rop": 77, "is_trt_wrapp": 77, "is_valid": 78, "is_valid_cross_attn": 78, "isauto": 0, "isbeamsearch": 0, "iscontextparallel": 1, "iscontinuouskvcach": 1, "iscrossattent": 1, "isdon": 1, "isdora": 1, "isdrafttokensextern": 1, "iseagl": [0, 1], "iselectlay": 77, "isexplicitdrafttoken": [0, 1], "isexternaldrafttoken": 0, "isfin": [0, 3], "isfirstcontextparallelrank": 1, "isfirstpipelineparallelrank": 1, "isfirsttensorparallelrank": 1, "isgreedysampl": 0, "ishufflelay": 77, "iskvcacheen": 1, "isl": [0, 19, 20, 21, 22, 24, 68, 69, 75], "islastpipelineparallelrank": 1, "isleg": 0, "islicelay": 77, "isload": 1, "islookahead": 0, "islookaheaddecod": 1, "ismedusa": [0, 1], "ismpist": 0, "ismultimod": 1, "isn": 87, "isnon": 1, "isoftmaxlay": 77, "isorchestr": 0, "ispagedkvcach": 1, "isparticip": [0, 88], "ispipelineparallel": 1, "ispoint": 1, "isrnnbas": 1, "issequencefin": [0, 3], "issocketst": 0, "issu": [5, 14, 17, 53, 59, 60, 62, 64, 68, 69, 70, 77, 87], "istensorparallel": 1, "isthreadsaf": 0, "istopk": 0, 
"istopkandtopp": 0, "istopkortopp": 0, "istopp": 0, "istransformerbas": 1, "istream": [0, 1], "isunsign": 1, "isusebantoken": 0, "isusebanword": 0, "isuseexpliciteosstop": 0, "isusefrequencypenalti": 0, "isusemaxlengthstop": 0, "isuseminlength": 0, "isuseminp": 0, "isusenorepeatngrams": 0, "isuseoccurrencepenalti": 0, "isusepenalti": 0, "isusepresencepenalti": 0, "isuserepetitionpenalti": 0, "isusestopcriteria": 0, "isusestopword": 0, "isusetemperatur": 0, "isusevariablebeamwidthsearch": 0, "iswhisp": 1, "ite": 82, "item": [0, 3, 82], "itensor": [0, 77], "itensorbind": 1, "itensorptr": 1, "iter": [0, 1, 3, 5, 10, 15, 24, 26, 65, 66, 68, 70, 74, 75, 76, 82, 88], "iter_stats_max_iter": 65, "iterationresult": 65, "iterationstat": 0, "iterationtyp": 0, "iterlatencym": [0, 26], "iterlatencymillisec": 88, "iterstat": 0, "iterstatsmaxiter": 0, "iterstatsvec": 0, "ith": 77, "itl": [72, 75, 88], "its": [0, 1, 3, 5, 6, 7, 11, 13, 14, 15, 17, 19, 21, 24, 40, 64, 66, 68, 71, 73, 74, 75, 77, 84, 91, 92, 94], "itself": [3, 82], "itsuji": 68, "iunarylay": 77, "j": [5, 6, 20, 23, 50, 51, 52, 64, 68, 77, 85, 86, 88], "jacobi": 10, "jai": 88, "jamesthez": 88, "jane": 54, "janpetrov": 88, "japanes": [9, 68], "jax": [13, 17], "ji": 77, "jit": [18, 62, 88], "jj": 77, "jk": 77, "jl749": 88, "job": [14, 51, 52], "joint_attention_kwarg": 79, "joint_attn_forward": 78, "journei": [24, 66], "jpg": 68, "json": [0, 1, 3, 13, 26, 29, 30, 31, 34, 35, 40, 47, 65, 67, 68, 83, 88], "json_object": 65, "jsonconfigstr": 0, "jsonl": 68, "jsonseri": 0, "just": [0, 1, 10, 50, 51, 52, 53, 62, 68, 70, 76, 82, 84], "justic": [38, 41, 42, 44, 45, 53], "k": [1, 5, 6, 9, 10, 16, 24, 65, 77, 85, 87, 88, 90, 92], "k_b_proj_tran": 77, "k_dim": 77, "k_proj": [15, 90], "kattent": 1, "kattn_dens": 1, "kattn_k": 1, "kattn_q": 1, "kattn_qkv": 1, "kattn_v": 1, "kauto": 0, "kbatchedpostprocessornam": [0, 3], "kbeamsearch": 0, "kbf16": 0, "kbool": [0, 1], "kbyte_typ": 1, "kc_cache_retention_config": 88, "kcancel": 0, 
"kchatglm": 1, "kcontext": 1, "kcontext_in_progress": 0, "kcontinu": 1, "kcpu": [0, 1], "kcpu_pin": 0, "kcpu_pinnedpool": 0, "kcross_attn_dens": 1, "kcross_attn_k": 1, "kcross_attn_q": 1, "kcross_attn_qkv": 1, "kcross_attn_v": 1, "kdatatyp": 1, "kdecoder_onli": [0, 11], "kdefault": 0, "kdefault_num_tokens_per_block": 1, "kdefaultbatchsizet": 0, "kdefaultdynamicbatchmovingaveragewindow": 0, "kdefaultgpuspernod": 1, "kdefaultiterstatsmaxiter": 0, "kdefaultlookaheaddecodingngram": 0, "kdefaultlookaheaddecodingverificationset": 0, "kdefaultlookaheaddecodingwindow": 0, "kdefaultmaxadapters": 0, "kdefaultmaxpagesperblockdevic": 0, "kdefaultmaxpagesperblockhost": 0, "kdefaultmaxseqidlemicrosecond": 0, "kdefaultoptimaladapters": 0, "kdefaultprior": 0, "kdefaultrequeststatsmaxiter": 0, "kdefaultretentionprior": 0, "kdisabl": 1, "kdrafttokensextern": 1, "kdynamicpostprocessornameprefix": 0, "keagl": [0, 1], "kebnf_grammar": [0, 3], "keep": [0, 5, 6, 17, 24, 65, 69, 76, 77, 88], "keepdim": 77, "kei": [0, 2, 3, 8, 14, 19, 23, 59, 68, 69, 74, 79, 82, 87, 91, 92, 93], "kenabl": 1, "kencdec": 1, "kencoder_decod": 0, "kencoder_in_progress": 0, "kencoder_onli": 0, "kend_id": 0, "kept": [5, 17, 77], "kequal_progress": 0, "kera": 17, "kernel": [1, 5, 8, 14, 19, 25, 47, 62, 66, 67, 72, 75, 77, 82, 83, 84, 87, 88], "kernel_s": [77, 78], "kexplicitdrafttoken": [0, 1], "kexternaldrafttoken": 0, "key_length": [77, 78], "keyvaluecacheparam": [78, 79], "keyword": [15, 65, 77, 84], "kfirst_come_first_serv": 0, "kfloat": [1, 14], "kfp16": 0, "kfp32": [0, 65], "kfp8": 0, "kgener": 1, "kgeneration_complet": 0, "kgeneration_in_progress": 0, "kglm": 1, "kgpt": 1, "kgpu": [0, 1], "kguaranteed_no_evict": 0, "khalf": 1, "kind": [4, 5, 7, 24, 94], "kinflight": 0, "king": 46, "kint32": [0, 1], "kint64": [0, 1], "kint8": [0, 1], "kinvalid": 1, "kispoint": 1, "kisunsign": 1, "kj": 77, "kjson": [0, 3], "kjson_schema": [0, 3], "kleader": [0, 2], "klength": 0, "klinear": 1, "klookahead": 0, 
"klookaheaddecod": 1, "kmamba": 1, "kmax_util": 0, "kmaxretentionprior": 0, "kmedusa": [0, 1], "kminretentionprior": 0, "kmla": 0, "kmlp_4h_to_h": 1, "kmlp_gate": 1, "kmlp_gate_up": 1, "kmlp_h_to_4h": 1, "kmlp_router": 1, "kmoe_4h_to_h": 1, "kmoe_gat": 1, "kmoe_h_to_4h": 1, "kmoe_rout": 1, "kmpi": 0, "knegativeinfin": 1, "knob": [0, 65, 76, 77], "knone": 1, "knoop": 1, "knot_finish": 0, "know": [6, 67, 76, 77], "knowledg": 59, "known": [5, 10, 14, 59, 62, 77, 86], "knumflag": 0, "kopt_profiles_split_point": 1, "korchestr": [0, 2], "kosmo": [86, 88], "kpage": 1, "kpin": 1, "kpinnedpool": 1, "kqueu": 0, "krecurr": 1, "krecurrentgemma": 1, "kregex": [0, 3], "kstatic": 0, "kstatic_batch": 0, "kstop_word": 0, "kstructural_tag": 0, "ktimed_out": 0, "ktopk": 0, "ktopktopp": 0, "ktopp": 0, "ktrtpointertyp": 1, "kuint8": [0, 1], "kunderlyingtyp": 1, "kunish": 9, "kunknown": 0, "kunsign": 1, "kusebantoken": 0, "kusebanword": 0, "kuseexpliciteosstop": 0, "kusefrequencypenalti": 0, "kusemaxlengthstop": 0, "kuseminlength": 0, "kuseminp": 0, "kusenorepeatngrams": 0, "kuseoccurrencepenalti": 0, "kusepenalti": 0, "kusepresencepenalti": 0, "kuserepetitionpenalti": 0, "kusestandardstopcriteria": 0, "kusestopword": 0, "kusetemperatur": 0, "kusevariablebeamwidthsearch": 0, "kuvm": [0, 1], "kv": [0, 1, 2, 3, 9, 14, 17, 19, 23, 25, 26, 32, 36, 37, 39, 49, 59, 63, 65, 66, 68, 69, 70, 74, 77, 82, 83, 88, 89, 90, 91, 92, 94], "kv_b_proj": 77, "kv_cach": 0, "kv_cache_block_offset": [77, 78, 82, 87], "kv_cache_block_point": 87, "kv_cache_config": [26, 32, 39, 44, 46, 48, 49, 65, 76, 93], "kv_cache_dtyp": [46, 68, 72, 81, 93], "kv_cache_enable_block_reus": [82, 88], "kv_cache_free_gpu_mem_fract": [18, 69, 76], "kv_cache_free_gpu_memory_fract": [26, 33, 82, 88], "kv_cache_host_memory_byt": 8, "kv_cache_manag": [0, 88, 91, 92, 93, 94], "kv_cache_param": [78, 79, 92], "kv_cache_quant_algo": [13, 54, 65, 68, 72], "kv_cache_quant_mod": [5, 77], "kv_cache_retention_config": 65, 
"kv_cache_scaling_factor": [5, 13], "kv_cache_typ": [14, 25, 65, 82, 88], "kv_dtype": 79, "kv_event": 46, "kv_head": 78, "kv_host_cache_byt": 8, "kv_lora_rank": [77, 78], "kv_orig_quant_scal": 77, "kv_quant_orig_scal": 77, "kvalue_status_load": 1, "kvalue_status_miss": 1, "kvalue_status_process": 1, "kvcach": [0, 24, 39, 49, 88], "kvcacheconfig": [0, 5, 8, 32, 39, 44, 46, 48, 49, 65, 76, 84], "kvcachecreateddata": [0, 65], "kvcacheev": 0, "kvcacheeventdata": 0, "kvcacheeventdiff": 0, "kvcacheeventmanag": 0, "kvcachehitr": 0, "kvcachehitrateperrequest": 0, "kvcacheindex": 1, "kvcachemanag": [0, 5, 8, 82, 92, 93], "kvcachemetr": 0, "kvcacheparam": 92, "kvcacheremoveddata": [0, 65], "kvcacheretentionconfig": [0, 65], "kvcaches": 0, "kvcachestat": [0, 26], "kvcachestoredblockdata": 0, "kvcachestoreddata": [0, 65], "kvcachetransferend": 0, "kvcachetransferm": 0, "kvcachetransferstart": 0, "kvcachetyp": [1, 65, 82], "kvcachetypefromstr": 1, "kvcacheupdateddata": [0, 65], "kvfactor": 0, "kvheadnum": 77, "kwarg": [15, 17, 65, 77, 78, 79, 82, 88, 90], "kxgrammar": 0, "l": [10, 50, 51, 52, 68, 86], "l2": 25, "l20": 25, "l304": 24, "l345": 24, "l4": 25, "l40": 25, "l440": 24, "l506": 24, "l546": 24, "l823": 24, "lab": 68, "label": [7, 77, 78, 79], "labelembed": 78, "lack": 0, "lambda": [0, 3], "lamportinitializeal": 1, "languag": [0, 6, 10, 14, 16, 19, 24, 66, 67, 77, 85, 86, 88, 91], "language_adapt": [82, 88], "language_adapter_config": 82, "language_adapter_rout": [79, 82], "language_adapter_uid": 82, "language_model": 15, "languageadapterconfig": 82, "languageadapteruid": 0, "larg": [5, 8, 10, 14, 16, 17, 18, 19, 23, 24, 25, 30, 47, 56, 66, 67, 68, 71, 72, 74, 75, 77, 84, 86, 87, 88, 91], "larger": [0, 2, 5, 6, 8, 10, 11, 18, 20, 21, 23, 49, 65, 68, 69, 77, 82, 84, 88], "largest": [6, 19, 20, 21, 77], "last": [0, 1, 3, 5, 9, 10, 12, 24, 65, 74, 76, 77, 79], "last_lay": 82, "last_process_for_ub": 77, "last_token_id": [77, 79, 87], "last_token_ids_for_logit": 79, 
"last_tokens_id": 77, "lastdraftindic": 1, "lastdraftlen": 1, "lastdraftpath": 1, "lastdrafttoken": 1, "lastgenerationlength": 1, "lastit": 0, "lastpositionidsbas": 1, "lasttokentim": 0, "late": 53, "latenc": [0, 5, 8, 10, 20, 21, 23, 25, 59, 69, 74, 75, 76, 77, 88], "latent": [78, 79], "later": [0, 1, 6, 9, 10, 14, 17, 21, 41, 44, 64, 72, 75, 82, 84, 87, 89], "latest": [0, 27, 60, 83, 88], "latter": [3, 23, 88], "launch": [2, 8, 14, 26, 47, 50, 51, 52, 59, 62, 64, 70, 87, 88, 89], "launch_llama_3": 14, "layer": [0, 1, 2, 4, 5, 6, 7, 9, 10, 12, 13, 14, 15, 25, 65, 71, 77, 82, 83, 84, 85, 87, 88, 90, 92, 93], "layer1": 9, "layer_idx": [9, 12, 77, 82, 90, 92], "layer_names_onli": [25, 65], "layer_norm": [77, 78], "layer_quant_mod": 65, "layer_typ": 82, "layerid": [1, 9], "layeridx": 1, "layernorm": [12, 25, 75, 77, 78, 88], "layernorm_shar": 78, "layernorm_typ": 78, "layernormpositiontyp": 77, "layernormtyp": [77, 78], "layertyp": [1, 7], "layout": [74, 88], "lead": [7, 8, 10, 14, 25, 53, 60, 68, 69, 70, 72, 74, 75], "leader": [0, 82], "learn": [20, 21, 23, 38, 41, 42, 44, 45, 47, 72, 77, 83], "learned_absolut": [13, 77, 78, 79], "least": [3, 5, 17, 18, 26, 53, 74, 82], "leav": [54, 74, 75, 76], "left": [65, 69, 74, 76, 77], "legaci": [15, 76, 80, 88], "len": [1, 68, 77, 82, 94], "length": [0, 1, 5, 8, 18, 19, 20, 21, 22, 23, 24, 25, 26, 48, 65, 68, 69, 70, 73, 75, 76, 77, 82, 84, 87, 88, 92, 93], "length_penalti": [6, 65, 82], "lengthlengthpenalti": 6, "lengthpenalti": [0, 1, 6], "less": [0, 3, 5, 6, 14, 20, 65, 69, 77], "let": [7, 12, 13, 15, 24, 27, 32, 66, 68, 74, 77], "letter": 77, "level": [0, 1, 3, 5, 9, 12, 13, 15, 17, 25, 26, 44, 64, 67, 68, 84, 88, 90], "leverag": [10, 19, 24, 72, 83], "lf": [9, 18, 60, 64], "lfz941": 88, "lh": 1, "lib": [17, 62, 68], "libnvinfer_plugin_tensorrt_llm": 60, "libopenmpi": [61, 62], "librari": [14, 16, 60, 64, 66, 87, 88, 92], "libtensorrt_llm": 60, "licens": [64, 83], "life": 53, "lightweight": 5, "like": [0, 3, 5, 6, 7, 8, 
10, 13, 14, 16, 17, 23, 24, 25, 32, 38, 40, 41, 42, 43, 44, 45, 46, 47, 49, 50, 51, 52, 53, 54, 65, 66, 68, 70, 71, 72, 74, 75, 76, 77, 83, 84, 85, 87, 88, 89, 90, 91, 93], "likelihood": [4, 8, 10], "limit": [0, 2, 3, 5, 6, 7, 14, 17, 18, 23, 24, 32, 62, 64, 65, 66, 70, 74, 76, 77, 80, 82, 84, 86, 92], "lin": 19, "line": [8, 18, 23, 68, 70, 72, 75, 84, 88, 93, 94], "linear": [1, 9, 10, 12, 13, 14, 77, 84, 85, 88, 90, 92], "linearactiv": 78, "linearapproximategelu": 78, "linearbas": 78, "lineargeglu": 78, "lineargelu": 78, "linearli": 84, "linearswiglu": 78, "link": [8, 18, 24, 27, 28, 88], "linspac": 77, "linux": [59, 86, 88], "linux_x86_64": 60, "list": [0, 1, 3, 5, 6, 7, 13, 14, 15, 16, 32, 47, 60, 63, 65, 66, 68, 69, 70, 77, 78, 79, 82, 86, 87, 88, 92, 93, 94], "list_siz": 78, "liter": 65, "littl": 75, "live": 84, "livecodebench": 24, "lkm2835": 88, "ll": [23, 26], "llama": [6, 9, 10, 11, 15, 17, 20, 21, 23, 25, 41, 49, 64, 70, 71, 73, 74, 76, 83, 85, 86, 88, 89, 90], "llama2": [5, 9, 19, 20, 88], "llama3": 77, "llama4forconditionalgener": 86, "llama_13b": 21, "llama_70b": 21, "llama_7b": [9, 11], "llama_7b_with_lora_qkv": 9, "llama_model_path": 32, "llamaconfig": [79, 90], "llamaforcausallm": [15, 17, 79, 86], "llamamodel": 79, "llava": [15, 85, 86, 88], "llava_dict": 15, "llavallamamodel": 86, "llavanextforconditionalgener": 86, "llavanextvisionconfig": 79, "llavanextvisionwrapp": 79, "llm": [0, 2, 3, 5, 6, 7, 8, 9, 12, 14, 19, 22, 24, 25, 26, 29, 30, 31, 33, 34, 35, 39, 40, 41, 42, 43, 44, 46, 47, 48, 49, 53, 54, 55, 56, 57, 61, 62, 63, 65, 67, 69, 71, 72, 73, 75, 76, 77, 79, 81, 82, 85, 87, 89, 90, 91, 92, 93, 94], "llm_arg": [65, 69], "llm_engine_dir": 82, "llm_inference_distribut": 64, "llm_kwarg": [39, 49], "llm_mgmn_": 88, "llm_option": 69, "llm_ptq": 89, "llmapi": [3, 26, 32, 39, 40, 44, 46, 48, 49, 50, 51, 52, 54, 65, 69, 72, 88], "llmarg": [65, 69, 88], "llmrequest": [1, 93, 94], "llmrequestptr": 1, "llmrequestst": 94, "lm": 10, "lm_head": [12, 15, 
49, 68, 88], "lmm": [6, 68], "lmsy": [39, 49], "ln_emb": 15, "ln_f": [12, 15], "load": [0, 1, 9, 12, 13, 14, 17, 22, 24, 25, 41, 44, 49, 62, 64, 65, 68, 69, 70, 75, 76, 79, 81, 82, 83, 84, 88], "load_format": 65, "load_model_on_cpu": 79, "load_tensor": 15, "load_test_audio": 82, "load_test_data": 82, "load_weight": 90, "loaded_weight": 78, "loader": 88, "loadinprogress": 1, "loadweight": 1, "local": [13, 14, 18, 24, 25, 41, 42, 43, 44, 45, 50, 51, 52, 54, 60, 62, 65, 68, 69, 72, 88, 93], "local_in_featur": 78, "local_layer_idx": 78, "local_model": [50, 51, 52], "local_out_featur": 78, "local_us": [18, 60, 83], "localhost": [26, 29, 30, 31, 33, 34, 35, 55, 56, 57, 83], "localinadapters": 1, "localindim": 1, "localinouts": 1, "localins": 1, "localoutadapters": 1, "localoutdim": 1, "localouts": 1, "localreduct": 24, "localscaless": 1, "localtotals": 1, "locat": [6, 7, 14, 54, 60, 68, 69, 77, 83, 87, 92], "locate_accepted_draft_token": 82, "lock": [62, 68], "lockstep": 0, "log": [0, 1, 5, 25, 26, 27, 50, 51, 52, 54, 65, 68, 77, 83, 84, 88], "log_level": [25, 26], "log_softmax": 77, "logic": [3, 15, 17, 47, 78, 88, 90, 91, 94], "login": [27, 83], "logit": [0, 1, 6, 10, 24, 36, 37, 65, 68, 77, 79, 82, 87, 88], "logits_dtyp": [13, 25, 79], "logits_processor": [47, 65, 82], "logits_processor_map": 82, "logits_processor_nam": 82, "logitspostprocessor": 0, "logitspostprocessorbatch": [0, 3], "logitspostprocessorconfig": [0, 3, 88], "logitspostprocessormap": 0, "logitspostprocessornam": 0, "logitsprocessor": [47, 65, 82, 88], "logitsprocessorlist": 82, "logitsvec": 1, "logn": [77, 88], "logn_scal": 77, "logprob": [0, 1, 32, 48, 65, 83], "logprobs_diff": 65, "logprobscba": 1, "logprobstil": 1, "london": 87, "long": [5, 23, 25, 67, 68, 70, 71, 72, 74, 75, 84, 88], "long_mscal": [77, 78], "long_rop": 77, "long_rope_embed_posit": 78, "long_rope_embed_positions_for_gpt_attent": 78, "long_rope_rotary_cos_sin": 77, "long_rope_rotary_inv_freq": [77, 78], "longer": [0, 6, 8, 24, 65, 
69, 74, 77, 94], "longest": [2, 74, 77], "longrop": 77, "longtensor": [82, 90], "look": [0, 3, 17, 22, 60, 66, 68, 88], "lookahead": [0, 1, 36, 37, 59, 65, 88], "lookahead_config": [48, 65, 82], "lookahead_decod": [25, 79], "lookaheadconfig": 0, "lookaheaddecod": 1, "lookaheaddecodingbuff": 1, "lookaheaddecodingconfig": [0, 1, 48, 65], "lookaheadinput": 1, "lookaheadoutput": 1, "lookaheadruntimebuff": 1, "lookaheadruntimeconfig": 1, "lookup": [59, 77, 78, 88], "lookup_plugin": 77, "loop": [0, 3, 6, 14, 15, 76], "lopuhin": 88, "lora": [0, 1, 3, 36, 37, 59, 63, 65, 77, 78, 79, 82, 88], "lora_ckpt_sourc": [25, 82], "lora_config": [53, 65, 79], "lora_dir": [9, 25, 53, 82], "lora_dir1": 53, "lora_dir2": 53, "lora_dir3": 53, "lora_hidden_st": 78, "lora_layer_param": 78, "lora_manag": [53, 65, 82, 88], "lora_param": 79, "lora_plugin": [9, 25, 77, 82], "lora_rank": [9, 77], "lora_request": [53, 65], "lora_runtime_param": 78, "lora_target_modul": [9, 25, 79, 82], "lora_task_uid": 82, "lora_uid": 82, "lora_weights_point": 77, "loracachefullexcept": 1, "loracachepagemanag": 1, "loraconfig": [0, 9, 53, 65, 79, 88], "loraexpectedexcept": 1, "loraid": 0, "loramanag": 82, "loramodulenam": 1, "loraparam": 79, "loraprefetchdir": 0, "lorarequest": [53, 65], "loraruntimeparam": 78, "lorataskidtyp": [0, 1], "loraweight": 9, "loss": [23, 72], "lot": [5, 8, 14, 16], "loudspeak": 21, "lovelac": [66, 86, 88], "low": [5, 12, 17, 18, 23, 24, 25, 59, 77, 88], "low_latency_gemm": 77, "low_latency_gemm_plugin": [25, 68, 72, 78], "low_latency_gemm_swiglu": 77, "low_latency_gemm_swiglu_plugin": [25, 72, 80], "low_rank": 77, "lower": [0, 1, 2, 6, 7, 8, 9, 22, 23, 44, 65, 69, 72, 77, 84], "lru": [1, 8, 77], "lt": 77, "luotuo": 9, "m": [0, 18, 20, 24, 26, 34, 35, 40, 53, 68, 69, 70, 72, 74, 75, 77, 84, 85], "macceptancethreshold": 0, "machin": [8, 18, 23, 47, 88], "madditionalmodeloutput": 0, "made": [47, 66, 88, 94], "mahmoudashraf97": 88, "mai": [0, 1, 2, 3, 5, 8, 9, 10, 13, 14, 15, 17, 18, 24, 
25, 27, 50, 51, 52, 60, 62, 64, 67, 68, 69, 70, 75, 76, 77, 78, 80, 84, 87, 88, 90, 91, 92, 93], "main": [3, 6, 19, 22, 24, 30, 32, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 53, 54, 56, 61, 62, 64, 65, 67, 70, 72, 75, 76, 77, 83, 84, 87, 89, 90], "maintain": [2, 9, 19, 20, 23, 68, 72, 85], "major": [17, 24, 54, 66, 69, 84], "make": [1, 2, 5, 7, 9, 14, 17, 18, 23, 24, 27, 28, 48, 53, 59, 60, 66, 68, 70, 76, 77, 83, 87, 88], "make_causal_mask": 78, "makeshap": 1, "mallotedtim": 0, "mallreducecommptr": 1, "mamba": [25, 64, 77, 85, 86, 88], "mamba1": 77, "mamba2": [77, 88], "mamba_conv1d": 77, "mamba_conv1d_plugin": [25, 82], "mamba_vers": 77, "mambaconfig": 79, "mambaforcausallm": 79, "manag": [0, 1, 2, 5, 10, 14, 25, 32, 62, 64, 70, 76, 80, 82, 83, 84, 88, 89, 91, 92], "managedweight": 0, "managedweightsmap": 1, "manageweightstyp": 1, "manageweighttyp": 1, "mandatori": [1, 3, 13], "mani": [0, 5, 8, 10, 14, 17, 25, 28, 54, 65, 69, 72, 74, 76, 77, 86, 87], "manipul": 7, "manner": 7, "mantissa": 20, "manual": [32, 65, 82, 87], "manufactur": 68, "map": [0, 1, 2, 3, 5, 7, 12, 13, 14, 15, 17, 24, 69, 77, 78, 79, 82, 83, 93], "marcellu": 46, "mard1no": 88, "margin": [68, 74], "mark": [1, 7, 74, 77, 87], "mark_as_remov": 7, "mark_output": [3, 77], "markalldon": 1, "markdon": 1, "marks101": 88, "marktaskdon": 1, "mask": [0, 1, 5, 10, 24, 47, 77, 78, 79, 82, 92], "mask_typ": 77, "masked_scatt": 77, "masked_scatter_": 77, "masked_select": [77, 88], "massiv": 18, "master": [71, 72, 73], "mat2": 77, "match": [0, 4, 7, 10, 59, 65, 68, 77, 78, 82, 83, 87, 88], "match_and_rewrit": 7, "materi": 3, "math": [24, 86], "matichon": 88, "matmul": [5, 14, 25, 72, 77, 85], "matric": 4, "matrix": [5, 14, 22, 59, 66, 68, 71, 77, 83, 92], "mattentionconfig": 0, "mattentiontyp": 0, "matter": 8, "matur": 26, "max": [0, 1, 9, 19, 20, 21, 59, 65, 70, 72, 73, 75, 77, 82, 84, 87, 92], "max_all_reduce_block": 1, "max_attention_window": [65, 76, 88], "max_attention_window_s": [5, 76, 77, 82], 
"max_attn_valu": 78, "max_batch_s": [5, 9, 11, 13, 14, 17, 18, 25, 26, 32, 33, 39, 44, 48, 49, 65, 68, 72, 74, 75, 77, 79, 82, 84, 87, 88, 93], "max_beam_width": [3, 5, 25, 26, 32, 44, 65, 77, 79, 82, 84], "max_block": [77, 94], "max_blocks_per_seq": 82, "max_blocks_per_sequ": 77, "max_boost_slid": 68, "max_cache_storage_gb": 65, "max_context_length": [77, 78, 82, 84], "max_decoder_input_len": 79, "max_decoder_seq_len": 25, "max_dist": [5, 77, 78], "max_draft_len": [25, 39, 49, 65, 79, 81], "max_draft_token": [79, 82], "max_encoder_input_len": [25, 65, 79], "max_gen_token": 79, "max_input_len": [9, 11, 13, 14, 25, 65, 68, 79, 82, 84], "max_input_length": [77, 78, 79, 82], "max_kv_seqlen": 77, "max_lora_rank": [9, 25, 53], "max_low_rank": 77, "max_medusa_token": 82, "max_multimodal_len": 25, "max_new_token": [82, 84], "max_ngram_s": [48, 65], "max_non_leaves_per_lay": [39, 65], "max_num_request": [92, 93, 94], "max_num_token": [18, 25, 26, 32, 33, 44, 65, 68, 72, 74, 75, 79, 84, 88, 92], "max_output_len": [14, 82, 83, 87, 88], "max_period": 78, "max_position_embed": [13, 77, 78, 79], "max_position_embedding_len": 77, "max_power_limit": 68, "max_prompt_adapter_token": 65, "max_prompt_embedding_table_s": [25, 65, 82, 88], "max_record": 65, "max_seq_len": [9, 11, 13, 14, 25, 26, 39, 49, 65, 68, 76, 77, 78, 79, 82, 84, 88, 93], "max_seqlen": [5, 77], "max_seqlen_for_logn_sc": 78, "max_sequence_length": [5, 82], "max_token": [26, 29, 30, 31, 40, 46, 55, 56, 57, 65, 76, 83, 89], "max_tokens_in_paged_kv_cach": [76, 82, 88], "max_util": [0, 65, 76], "max_verification_set_s": [48, 65], "max_window_s": [48, 65], "maxaccepteddrafttokensperstep": 1, "maxacceptedtoken": 1, "maxadapters": 0, "maxattentionwindow": 1, "maxattentionwindowvec": [0, 1], "maxbadwordslen": 1, "maxbatchs": [0, 1, 6], "maxbatchsizeruntim": 0, "maxbatchsizeruntimeupperbound": 0, "maxbatchsizestat": 0, "maxbatchsizetunerrecommend": 0, "maxbeamwidth": [0, 1, 3, 88], "maxdecoderstep": 1, 
"maxdecodingdrafttoken": 1, "maxdecodingtoken": [0, 1], "maxdraftpathlen": [0, 1], "maxdrafttoken": [0, 1], "maxencoderlen": 1, "maxgenerationlength": 1, "maxgenlengthdevic": 1, "maxgenlengthhost": 1, "maxgentoken": 1, "maxim": [0, 19, 21, 24, 68, 76], "maximum": [0, 1, 2, 3, 5, 6, 18, 21, 25, 26, 65, 68, 69, 72, 77, 78, 82, 84, 87, 88, 93], "maxinputlen": [1, 6], "maxinputlength": 1, "maxlength": 1, "maxlengthstop": 0, "maxlorarank": 1, "maxmedusahead": 1, "maxnewtoken": [1, 88], "maxnonleafnodesperlay": 1, "maxnumactiverequest": 0, "maxnumblock": 0, "maxnumpath": 1, "maxnumsequ": [1, 88], "maxnumtoken": [0, 1], "maxnumtokensruntim": 0, "maxnumtokensstat": 0, "maxnumtokenstunerrecommend": 0, "maxoutputlength": 3, "maxpagesperblock": 1, "maxpagesperblockdevic": 0, "maxpagesperblockhost": 0, "maxpathdraftlen": 1, "maxpathlen": [0, 1], "maxpositionembed": [0, 1], "maxpromptembeddingtables": 1, "maxqueues": 0, "maxseqidlemicrosecond": 0, "maxseqlen": 1, "maxsequencelen": [1, 6], "maxsequencelength": 1, "maxstopwordslen": 1, "maxtoken": [0, 84, 88], "maxtokensperenginestep": 1, "maxtokensperstep": 1, "mb": 84, "mbackend": 0, "mbart": [86, 88], "mbatchingtyp": 0, "mbatchsizet": 0, "mbeamsearchbuff": 1, "mbeamsearchdiversityr": 0, "mbeamwidth": 0, "mbeamwidtharrai": 0, "mbp": 40, "mbuffer": 1, "mbuffermanag": 1, "mc_handl": 1, "mc_ptr": 1, "mc_va": 1, "mcachemap": 1, "mcachemutex": 1, "mcachepagemanag": 1, "mcachest": 0, "mcachetransceiverconfig": 0, "mcapacityschedulerpolici": 0, "mcommmod": 0, "mcommptr": 1, "mcommstat": 0, "mcommtyp": 0, "mcomputecontextlogit": 1, "mcomputegenerationlogit": 1, "mconfig": [0, 1], "mcontextchunkingpolici": 0, "mcontextfmha": 1, "mcontextparallel": 1, "mcopyonpartialreus": 0, "mcpu": 1, "mcpudiff": 1, "mcrosskvcachefract": 0, "mcudagraphcaches": 0, "mcudagraphmod": 0, "mcumlogprobstmp": 1, "md": [2, 10, 12, 24, 77, 88, 91], "mdatatyp": [0, 1], "mdebugconfig": 0, "mdebuginputtensor": 0, "mdebugoutputtensor": 0, "mdebugtensornam": 0, 
"mdebugtensorsmaxiter": 0, "mdecod": 1, "mdecodedurationm": 0, "mdecoderetentionprior": 0, "mdecoderst": 1, "mdecoderstream": 1, "mdecodingconfig": 0, "mdecodinglayerworkspac": 1, "mdecodingmod": [0, 1], "mdefaulteaglechoic": 1, "mdefaultmedusachoic": 1, "mdefaultposteriorthreshold": 1, "mdevic": 1, "mdevicebuffermanag": 1, "mdevicecacheperc": 0, "mdeviceid": [0, 1], "mdogreedysampl": 1, "mdonetask": 1, "mdprank": 0, "mdpsize": 0, "mdrafttoken": 0, "mdynamicbatchconfig": 0, "mdynamicbatchmovingaveragewindow": 0, "mdynamicdecodelay": 1, "mdynamictreemaxtopk": 0, "me": [30, 53, 54, 56, 83], "meaglechoic": 0, "meagleconfig": 0, "mean": [1, 4, 5, 6, 8, 10, 13, 15, 17, 18, 20, 21, 26, 34, 35, 51, 53, 65, 67, 68, 69, 70, 71, 76, 77, 80, 82, 84], "meaning": [1, 72, 75], "meant": 73, "mearlystop": 0, "measur": [0, 19, 21, 22, 23, 59, 68, 70, 88], "mechan": [3, 14, 93, 94], "media": [68, 88], "media_path": 68, "medium": [23, 87, 88], "medusa": [0, 1, 25, 36, 37, 59, 65, 77, 79, 82, 88], "medusa_choic": [10, 49, 65, 68, 82], "medusa_decode_and_verifi": 82, "medusa_hidden_act": 81, "medusa_logit": 82, "medusa_model_dir": 81, "medusa_output_token": 82, "medusa_path": 82, "medusa_position_offset": 82, "medusa_temperatur": [10, 82], "medusa_topk": 82, "medusa_tree_id": 82, "medusachoic": [0, 1], "medusaconfig": 79, "medusacurtokensperstep": 1, "medusadecodingconfig": [49, 65], "medusaforcausallm": 79, "medusainput": 1, "medusalogit": 1, "medusapath": 1, "medusatargettokensperstep": 1, "medusatreeid": 1, "meet": [23, 77], "membeddingt": 0, "member": [0, 1, 6, 7, 11, 14, 54, 77], "memlock": [60, 87], "memori": [0, 1, 2, 4, 5, 6, 9, 14, 15, 17, 19, 20, 22, 23, 24, 25, 26, 32, 47, 59, 65, 68, 69, 70, 74, 75, 77, 82, 87, 88, 92, 93], "memorypoolfre": [1, 84], "memorypoolreserv": [1, 84], "memorypooltrimto": 1, "memorypoolus": 1, "memorytyp": [0, 1], "memorytypestr": 1, "memtyp": 1, "memusagechang": 84, "menableattentiondp": [0, 1], "menablebatchsizetun": 0, "menableblockreus": 0, 
"menablechunkedcontext": 0, "menablecontextfmhafp32acc": 0, "menablemaxnumtokenstun": 0, "menablepartialreus": 0, "menabletrtoverlap": 0, "mencodedvocab": 0, "mencoderhiddens": 1, "mengineaddr": 1, "menginebuff": 1, "menginepath": 1, "mengines": 1, "mental": 53, "mention": [6, 17, 18, 32, 72], "menu": [27, 28], "merg": [24, 77], "meshgrid": 77, "meshgrid2d": 77, "messag": [24, 26, 29, 30, 55, 56, 62, 69, 77, 83, 84, 88], "met": [0, 1, 3, 10], "meta": [17, 64, 65, 68, 69, 70, 76, 83, 86], "meta_ckpt_dir": 79, "metadata": [90, 92], "metal": [88, 89], "meth": 64, "method": [0, 1, 3, 5, 6, 10, 11, 13, 14, 17, 19, 32, 47, 62, 68, 82, 85, 87, 88, 90, 91, 93, 94], "metric": [0, 65, 67, 68, 69, 70, 72, 74, 75, 88], "mevent": 1, "meventbuffermaxs": 0, "mexecutionconfig": 1, "mextendedruntimeperfknobconfig": 0, "mfastlogit": 0, "mfinishedstep": 1, "mfirstgentoken": 0, "mfreegpumemoryfract": 0, "mfreepageid": 1, "mfrequencypenalti": 0, "mfuntowicz": 88, "mgathergenerationlogit": 0, "mgemmallreducedtyp": 1, "mgmn": [36, 37], "mgpu": 1, "mgpudiff": 1, "mgpuspernod": 1, "mgpuweightsperc": 0, "mgreedysampl": 0, "mguid": 0, "mguideddecodingconfig": 0, "mguidetyp": 0, "mh": 10, "mh1": 10, "mha": [5, 19, 25, 77, 82, 92], "mhiddens": 1, "mhostcaches": 0, "mi": 85, "mib": 84, "micro": [0, 84], "microbatchid": 0, "microbatchschedul": [91, 94], "microsecond": 0, "microsoft": 13, "middl": 67, "might": [0, 3, 14, 17, 18, 23, 25, 60, 64, 66, 68, 70, 71, 75, 82, 84, 87, 88, 93], "migrat": [17, 80, 88], "million": [54, 68], "millisecond": 0, "millisecondstyp": 0, "mimpl": 0, "min": [0, 1, 6, 20, 24, 68, 70, 75, 77, 87], "min_lat": 77, "min_length": [6, 82], "min_p": [0, 6, 65, 82], "min_token": 65, "mind": [23, 76], "mindim": 1, "mindimfirst": 1, "mini": 88, "minim": [24, 74, 83], "minimum": [0, 5, 6, 65, 68, 69, 72, 77, 82, 84], "minitron": [86, 88], "minittozero": 1, "minlength": [1, 6, 88], "minnormedscorescba": 1, "minor": [54, 88], "minp": [0, 1, 6], "minprogresstask": 1, "minputpack": 
1, "minputtokenextraid": 0, "mintoken": [0, 88], "mintpsplitdim": 1, "minut": [0, 23, 70], "mip": 0, "mipcmemoryhandl": 1, "mirco": 0, "mish": 78, "mismatch": [17, 62, 87], "misorchestr": 0, "mispagefre": 1, "miss": [0, 7, 18, 68, 88], "missedblock": 0, "missedblocksperrequest": 0, "mission": 24, "mistral": [4, 64, 68, 72, 75, 85, 86, 88], "mistralai": [68, 86], "mistralforcausallm": 86, "misus": 88, "miterstatsmaxiter": 0, "mitig": [17, 24], "mix": [2, 71, 75, 88], "mixed_precis": 65, "mixer": 88, "mixtral": [4, 9, 64, 68, 72, 75, 85, 86, 88], "mixtralforcausallm": 86, "mixtur": [59, 75, 88], "mjointdecodinginput": 1, "mjointdecodingoutput": 1, "mkdir": 27, "mkdtemp": [41, 44], "mkvcacheconfig": 0, "mkvcachetyp": 1, "mkvfactor": 0, "ml": [77, 88], "mla": [24, 77, 88], "mlayertyp": 1, "mlengthpenalti": 0, "mllama": [86, 88], "mllamaconfig": 79, "mllamaforcausallm": 79, "mllamaforconditionalgener": 86, "mlogit": 0, "mlogitsdtyp": 1, "mlogitspostprocessorconfig": 0, "mlookaheaddecodingconfig": 0, "mlookaheaddecodingmaxnumrequest": 0, "mloramodul": 1, "mloraprefetchdir": 0, "mlp": [9, 12, 14, 15, 25, 77, 87, 88, 90], "mlp_4h_to_h": [9, 25], "mlp_bia": 79, "mlp_gate": [9, 25], "mlp_gate_up": [9, 25], "mlp_h_to_4h": [9, 25], "mlp_output": 87, "mlp_router": [9, 25], "mlphiddens": 1, "mlptype": 77, "mm": 88, "mm_data": 68, "mm_embedding_offload": 82, "mma": 77, "mmanag": 1, "mmanagedweightsmap": 1, "mmanageweightstyp": 1, "mmaxadapters": 0, "mmaxattentionwindow": 0, "mmaxattentionwindowvec": 0, "mmaxbatchs": [0, 1], "mmaxbeamwidth": [0, 1], "mmaxdecodingdecodertoken": 1, "mmaxdecodingdrafttoken": 1, "mmaxdecodingenginetoken": 1, "mmaxdraftpathlen": 1, "mmaxencoderlen": 1, "mmaxinputlen": 1, "mmaxlorarank": 1, "mmaxnonleafnodesperlay": 1, "mmaxnumpackedmask": 1, "mmaxnumpath": 1, "mmaxnumtoken": [0, 1], "mmaxpagesperblock": 1, "mmaxpagesperblockdevic": 0, "mmaxpagesperblockhost": 0, "mmaxpositionembed": 1, "mmaxpromptembeddingtables": 1, "mmaxqueues": 0, 
"mmaxseqidlemicrosecond": 0, "mmaxsequencelen": 1, "mmaxsequencelength": 1, "mmaxtoken": 0, "mmedusachoic": 0, "mmemorytyp": 1, "mmha": [77, 88], "mminp": 0, "mmintoken": 0, "mmlphiddens": 1, "mmlu": [23, 24, 88], "mmlu_llmapi": 88, "mmmu": 68, "mmodelconfig": [0, 1], "mmodelnam": 1, "mmodelvari": 1, "mmoduleidtomodul": 1, "mmropepositiondelta": 0, "mmroperotarycossin": 0, "mmultiblockmod": 0, "mname": 1, "mnbattentionlay": 1, "mnbhead": 1, "mnbkvheadsperlay": 0, "mnblayer": 1, "mnbrnnlayer": 1, "mngramsiz": 0, "mnorepeatngrams": 0, "mnormalizelogprob": 0, "mnumcopystream": [0, 1], "mnumdecodingenginetoken": 1, "mnumdevicemodulelay": 0, "mnumensurework": 0, "mnumhostmodulelay": 0, "mnumkvheadsperattentionlay": 1, "mnumkvheadspercrossattentionlay": 1, "mnumlanguag": 1, "mnumnod": 0, "mnumputwork": 0, "mnumreturnbeam": 0, "mnumreturnsequ": 0, "mnumsm": 1, "mnumtransformerslay": 1, "modal": 85, "mode": [0, 1, 4, 5, 7, 14, 15, 25, 26, 40, 50, 51, 52, 65, 76, 77, 78, 82, 84, 85, 88, 90], "model": [0, 1, 2, 3, 4, 5, 8, 9, 11, 13, 17, 19, 20, 21, 22, 23, 25, 26, 29, 30, 31, 32, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 61, 62, 65, 66, 67, 70, 73, 76, 77, 78, 80, 81, 82, 84, 85, 89, 92, 93, 94], "model_architectur": 65, "model_cl": 78, "model_cls_fil": 25, "model_cls_nam": 25, "model_config": [25, 65, 82, 90], "model_dir": [9, 11, 12, 13, 14, 15, 17, 49, 50, 68, 71, 79, 81, 83, 87], "model_engin": 93, "model_nam": [51, 69, 82], "model_path": [11, 51, 67, 68], "model_qu": 68, "model_weights_load": [15, 88], "modelconfig": [0, 6, 82, 88, 90], "modelengin": [91, 93], "modelidtomodel": 1, "modeling_deepseekv3": 24, "modeling_llama": 90, "modeling_mymodel": 90, "modeling_opt": 90, "modeling_util": [65, 90], "modelnam": 1, "modelopt": [13, 17, 49, 62, 68, 69, 81, 88], "modelopt_cuda_ext": 62, "modelpath": 0, "modelrunn": [13, 82, 88], "modelrunnercpp": [82, 88], "modelrunnermixin": 82, "modeltyp": [0, 11], "modelvari": 1, 
"modelweightsformat": 15, "modelweightsload": [15, 88], "modern": 82, "modif": [7, 14], "modifi": [3, 7, 60, 68, 72, 75, 76, 87, 88], "modul": [0, 1, 5, 6, 12, 13, 14, 15, 24, 25, 59, 60, 65, 75, 77, 78, 79, 81, 82, 87, 88, 90], "modular": 66, "module1": 24, "module10": 24, "module11": 24, "module12": 24, "module13": 24, "module2": 24, "module3": 24, "module4": 24, "module5": 24, "module6": 24, "module7": 24, "module8": 24, "module9": 24, "module_id": 9, "moduleid": [1, 9], "moduleidtomodel": 1, "modulelist": 90, "moduletyp": 1, "modulo": 77, "moe": [9, 15, 24, 25, 45, 59, 65, 75, 77, 79, 88], "moe_4h_to_h": [9, 25], "moe_allreduce_residual_rms_norm": 77, "moe_backend": 18, "moe_cluster_parallel_s": 65, "moe_ep_s": 4, "moe_expert_parallel_s": [45, 65], "moe_gat": [9, 25], "moe_h_to_4h": [9, 25], "moe_plugin": 25, "moe_rout": [9, 25], "moe_tensor_parallel_s": [45, 65], "moe_tp_siz": 4, "moeconfig": 79, "moetopk": 88, "moment": 3, "monboardblock": 0, "monitor": 25, "monitor_memori": [25, 65], "monolith": 5, "monost": 0, "month": 68, "mopenipc": 1, "moptimaladapters": 0, "morchestratorconfig": 0, "morchleadercomm": 0, "more": [0, 1, 2, 3, 4, 5, 6, 7, 10, 12, 13, 14, 19, 20, 21, 23, 24, 25, 26, 32, 36, 46, 47, 54, 60, 65, 66, 68, 69, 70, 72, 74, 75, 76, 77, 83, 84, 87, 88, 90, 92, 94], "most": [0, 1, 6, 10, 14, 17, 19, 20, 21, 23, 24, 38, 41, 42, 44, 45, 65, 67, 73, 75, 76, 77, 84, 87, 88], "mount": [26, 50, 51, 52], "mount_dest": [50, 51, 52], "mount_dir": [50, 51, 52], "moutdim": 1, "moutdimfirst": 1, "moutputbeamhypothes": 1, "mouttpsplitdim": 1, "move": [0, 1, 17, 47, 65, 66, 77, 87, 88], "movement": 14, "mownsev": 1, "mownsstream": 1, "mp4": [30, 56], "mpageblock": 1, "mpagedcontextfmha": 1, "mpagedst": 1, "mpagemanagerconfig": 1, "mpagesmutex": 1, "mpagewidth": 1, "mparallelconfig": 0, "mparticipantid": 0, "mpeftcacheconfig": 0, "mpi": [0, 1, 2, 6, 14, 16, 17, 25, 26, 50, 51, 52, 62, 65, 67, 68, 70, 77, 87, 88], "mpi4pi": [64, 70, 87, 88], "mpi_abort": 64, 
"mpi_barri": 17, "mpi_comm_world": [6, 64], "mpi_group_barri": 1, "mpicomm": 0, "mpicommsess": 65, "mpin": 1, "mpinneddiff": 1, "mpinnedpool": 1, "mpinnedpooldiff": 1, "mpipelineparallel": [0, 1], "mpirun": [13, 14, 64, 70, 87, 88], "mpisess": 65, "mpistat": 0, "mpointer": 1, "mpool": 1, "mport": 0, "mposteriorthreshold": 0, "mppreducescatt": 1, "mprecis": 1, "mpresencepenalti": 0, "mprocessorbatch": 0, "mprocessormap": 0, "mprompttableoffload": 0, "mpt": [23, 85, 86, 88], "mptforcausallm": 79, "mptmodel": 79, "mqa": [5, 19, 22, 24, 25, 77, 88, 92], "mquantmod": 1, "mrank": [0, 1], "mrecvpollperiodm": 0, "mrepetitionpenalti": 0, "mreplic": 0, "mreqid": 0, "mrequeststatsmaxiter": 0, "mrnnconfig": 1, "mrope": [0, 77], "mrope_param": [78, 82], "mrope_position_delta": [77, 78, 82], "mrope_rotary_cos_sin": [77, 78], "mrope_rotary_cos_sin_s": 79, "mropeconfig": 0, "mropeparam": [78, 82], "mropepositiondelta": 0, "mroperoratysinco": 0, "mrotaryembeddingdim": 1, "mruntimedefault": 1, "mruntimestream": 1, "msamplingconfig": 1, "mscale": 77, "mscale_all_dim": 77, "mschedulerconfig": 0, "msecondaryofflineminprior": [0, 65], "msecondaryoffloadminprior": 0, "mseed": 0, "mselfidx": 0, "msg": [0, 1, 24], "msinktokenlength": 0, "msizeperhead": [0, 1], "mskipcrossattnblock": 1, "msl": 1, "mslotsperpag": 1, "mspawnprocess": 0, "mspeculativedecodingconfig": 0, "mspeculativedecodingmod": 1, "mspeculativedecodingmodul": 1, "mstate": [0, 1], "mstoptokenid": 0, "mstream": 1, "mt5": 86, "mtag": 0, "mtaskid": 0, "mtemperatur": 0, "mtensor": 0, "mtensorparallel": [0, 1], "mtoken": 0, "mtokenizerstr": 0, "mtokenrangeretentionconfig": 0, "mtokensperblock": [0, 1], "mtopk": 0, "mtopp": 0, "mtoppdecai": 0, "mtoppmin": 0, "mtoppresetid": 0, "mtotalnumpag": 1, "mtp": [18, 65, 88], "mtp3_autoregress": 24, "mtp3_top1": 24, "mtp3_top10": 24, "mtp3_top15": 24, "mtp3_vanilla": 24, "mtpdecodingconfig": 65, "mtprank": 1, "mtrimpool": 1, "mtype": 1, "much": [8, 14, 67, 69, 74, 84], "mul": 77, "multi": 
[0, 2, 3, 4, 6, 8, 9, 10, 13, 16, 17, 19, 25, 30, 50, 51, 52, 56, 59, 60, 64, 65, 70, 77, 79, 84, 85, 88, 92], "multi_block_mod": [5, 65, 82, 88], "multiblockmod": 0, "multidimension": 77, "multihead": [14, 19], "multimod": [0, 25, 58, 68, 82, 86, 88], "multimodalembed": 0, "multimodalmodelrunn": 82, "multinod": 71, "multinomi": 6, "multipl": [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 14, 15, 24, 25, 36, 37, 65, 66, 70, 71, 72, 74, 77, 78, 82, 83, 87, 88, 92], "multiple_profil": [25, 68, 72, 75, 88], "multipli": [5, 15, 77], "multiply_and_lora": 78, "multiply_collect": 78, "multiprocessor": 14, "munsign": 1, "musecrossattent": 1, "musedynamictre": 0, "musegemmallreduceplugin": 1, "musegptattentionplugin": 1, "musegpudirectstorag": 0, "museloraplugin": 1, "musemambaconv1dplugin": 1, "musemrop": 1, "musepositionembed": 1, "museshapeinfer": 1, "musetokentypeembed": 1, "must": [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 14, 16, 25, 26, 28, 40, 65, 72, 77, 78, 80, 82, 85, 87], "mutabl": [0, 1], "mutablepageptr": 1, "mutex": 1, "mutual": [6, 85], "muvm": 1, "muvmdiff": 1, "mverificationsets": 0, "mversion": 1, "mvocabs": 1, "mvocabsizepad": 1, "mweight": 0, "mwindows": 0, "mworkerexecutablepath": 0, "mworldconfig": 1, "my": [1, 36, 38, 39, 41, 42, 43, 44, 45, 47, 49, 54, 61, 62, 68, 83, 89], "my_faster_on": 32, "my_model": 12, "my_profile_export": [26, 34, 35], "myattent": 90, "mybatchedlogitsprocessor": 47, "myconfig": 90, "mydecoderlay": [12, 90], "mylogitsprocessor": 47, "mymodel": [12, 90], "mymodelforcausallm": [12, 90], "n": [1, 2, 5, 9, 10, 13, 14, 26, 38, 40, 41, 42, 43, 44, 45, 47, 50, 51, 52, 53, 54, 64, 65, 68, 70, 74, 77, 78, 79, 84, 85, 87, 88], "n_worker": 65, "na": [68, 88], "naiv": 75, "naivepatternrewriter_replaceaddwithsub": 7, "name": [0, 1, 3, 6, 7, 9, 13, 14, 26, 27, 36, 38, 39, 41, 42, 43, 44, 45, 47, 49, 51, 54, 61, 62, 64, 65, 68, 69, 70, 77, 79, 80, 81, 82, 83, 87, 88, 89, 90], "named_network_output": 87, "named_paramet": 15, "namespac": [0, 1, 64, 79], "nation": 68, 
"nationwid": 68, "nativ": [17, 20, 88, 90], "native_quant_flow": 79, "natur": [17, 30, 56, 70], "naur": [0, 3, 65], "nb": 79, "nbattentionlay": [0, 1], "nbdim": 1, "nbhead": 1, "nbkvhead": [0, 1], "nbkvheadperlay": 0, "nblayer": 1, "nbrnnlayer": 1, "nccl": [14, 24, 25, 77, 87, 88], "nccl_p2p_level": 88, "nccl_plugin": 25, "ncclplugin": 14, "ncclrecv": 77, "ncclsend": 77, "nd": [68, 77], "ndarrai": [77, 78, 82], "ndim": 77, "nearest": 77, "nearli": [7, 20], "necess": 10, "necessari": [1, 4, 10, 24, 53, 72, 77, 88, 93], "necessarili": [1, 14, 84], "need": [1, 2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 24, 26, 27, 32, 36, 40, 45, 50, 51, 52, 53, 60, 61, 62, 64, 65, 66, 68, 69, 70, 71, 72, 74, 75, 76, 77, 79, 80, 82, 83, 84, 87, 88, 90, 91, 92, 93, 94], "needed_block": 94, "needsdecoderprologu": 1, "needskvcacherewind": 1, "neg": [1, 76, 77], "neglig": [8, 23, 74], "neither": [3, 77, 84], "nemo": [13, 16, 25, 66, 70, 82, 85, 86, 88], "nemo_ckpt_dir": 79, "nemo_prompt_convert": 82, "nemotron": [86, 88], "nemotron_na": 88, "nemotronforcausallm": 86, "nemotronna": [86, 88], "nemotronnasforcausallm": 86, "neox": [5, 6, 85, 86, 88], "nest": 7, "net": [8, 87], "net_guard": 7, "network": [3, 4, 5, 7, 14, 16, 17, 25, 40, 77, 83, 84, 85, 87, 88], "neural": [4, 7, 14, 83, 88], "neva": [86, 88], "never": [7, 68, 76], "new": [0, 1, 3, 5, 6, 7, 8, 9, 10, 11, 17, 20, 21, 24, 26, 27, 29, 31, 38, 41, 42, 43, 44, 45, 47, 55, 57, 59, 60, 64, 65, 66, 74, 75, 77, 82, 83, 88, 89, 91, 93], "new_decoder_architectur": [13, 79], "new_generated_id": 82, "new_input": 7, "new_out": 7, "new_shap": 77, "new_tensor": 77, "new_token": 82, "new_workflow": 88, "newactiverequestsqueuelatencym": [0, 26], "newer": [86, 88], "newest": 21, "newli": [0, 65, 74], "newsiz": 1, "newtoken": 1, "newtokensstep": 1, "newtokensvec": 1, "newvalu": 0, "next": [1, 9, 10, 14, 17, 20, 59, 60, 66, 71, 72, 74, 75, 76, 82, 84, 86, 88], "next_logit": 82, "next_medusa_input_id": 82, "next_medusa_logit": 82, 
"next_step_buff": 82, "next_step_tensor": 82, "nextdraftindic": 1, "nextdraftlen": 1, "nextdraftpath": 1, "nextdraftprob": 1, "nextdrafttoken": 1, "nextdrafttokenslen": 1, "nextflattoken": 1, "nextgenerationlength": 1, "nextpositionoffset": 1, "ngc": [61, 62, 83, 88, 89], "ngoanpv": 88, "ngram": [0, 6, 65], "ngramsiz": 0, "ngroup": 77, "nhead": 77, "nhere": 40, "ni": [40, 85], "nine": 83, "nj": 43, "njane": [38, 41, 42, 43, 44, 45, 47], "njason": 53, "nmh": 82, "nmt": [82, 86, 88], "nn": [77, 90], "no_quant": 65, "no_repeat_ngram_s": [6, 65, 82], "no_schedule_after_st": 94, "no_schedule_until_st": 94, "noauxtckernel": 24, "node": [0, 2, 6, 16, 25, 50, 51, 52, 59, 64, 65, 67, 70, 71, 77, 82, 85, 87, 88], "noexcept": [0, 1], "nomin": [38, 41, 42, 43, 44, 45], "non": [0, 2, 5, 11, 14, 17, 23, 24, 25, 47, 77, 87, 88], "non_block": 47, "non_gated_vers": 77, "none": [1, 6, 7, 12, 15, 17, 25, 26, 32, 46, 47, 48, 49, 53, 54, 65, 68, 70, 74, 77, 78, 79, 80, 81, 82, 87, 88, 90, 92], "nonetyp": [65, 82], "nonzero": 77, "nor": 84, "norepeatngrams": [0, 1, 6], "norm": [15, 18, 51, 67, 68, 69, 70, 77, 88, 90], "norm_before_bmm1": [78, 79], "norm_elementwise_affin": 78, "norm_ep": 78, "norm_epsilon": [13, 79], "norm_factor": 5, "norm_num_group": 78, "norm_pre_residual_weight": 77, "norm_quant_fus": 25, "norm_typ": 78, "norm_weight": 77, "normal": [0, 6, 8, 9, 11, 23, 24, 65, 68, 77, 84, 88], "normalize_log_prob": 65, "normalize_weight": 9, "normalized_shap": [77, 78], "normalizelogprob": [0, 1], "normedscorescba": 1, "north": [12, 14, 87], "northeastern": 83, "not_op": 77, "notabl": 23, "note": [1, 2, 7, 8, 9, 10, 14, 18, 21, 23, 24, 25, 28, 32, 46, 50, 51, 52, 54, 59, 60, 68, 69, 72, 74, 76, 77, 80, 82, 84, 85, 86, 87, 89, 90, 93], "notic": [46, 53], "notimplementederror": 17, "nougat": [85, 86, 88], "nour": 54, "now": [6, 10, 13, 15, 19, 24, 66, 68, 74, 80, 83, 84, 88], "np": 77, "npy": 82, "npytorch_backend_config": 26, "nsight": 59, "nsy": 67, "ntask": [14, 26, 50, 51, 52], 
"null": [1, 13, 68, 83], "nullopt": [0, 1], "nullptr": [0, 1], "num": [0, 1, 18, 49, 51, 59, 65, 67, 68, 69, 70, 72, 73, 75], "num_attention_head": [13, 77, 78, 79], "num_aud_token": 82, "num_beam": [6, 82], "num_beam_group": 6, "num_block": [82, 93], "num_blocks_per_cache_level": 46, "num_bucket": [77, 78], "num_channel": [78, 79], "num_class": 78, "num_context": 92, "num_ctx_token": 92, "num_draft_token": [0, 77, 82], "num_eagle_lay": [39, 65], "num_embed": 78, "num_experts_per_tok": 4, "num_gener": 92, "num_group": [77, 78], "num_head": [5, 15, 77, 82, 92], "num_hidden_lay": [13, 79, 90, 93], "num_imag": 82, "num_img_token": 82, "num_key_value_head": [13, 79, 93], "num_kv_head": [77, 78, 82, 92, 93], "num_kv_heads_origin": 77, "num_kv_heads_per_cross_attn_lay": 82, "num_kv_heads_per_lay": 82, "num_lay": [77, 78, 82, 93], "num_ln_in_parallel_attn": 79, "num_local_block": 78, "num_local_expert": 4, "num_lora_module_lay": 9, "num_lora_modules_lay": 9, "num_medusa_head": [49, 65, 79, 81, 82], "num_medusa_lay": [79, 81], "num_multimodal_token": 0, "num_nextn_predict_lay": [18, 65], "num_orig_po": 77, "num_po": 77, "num_postprocess_work": 26, "num_profil": 79, "num_q_head": 24, "num_request": [18, 68, 69], "num_return_sequ": [82, 88], "num_sampl": 67, "num_task": 78, "num_token": [5, 24, 77, 92], "num_tokens_per_block": [77, 93], "num_tokens_per_task": 78, "num_video": 82, "numactiverequest": 0, "numattentionhead": 1, "numavailablepag": 1, "numbeamscba": 1, "number": [0, 1, 2, 3, 4, 5, 6, 10, 14, 18, 22, 24, 25, 26, 47, 50, 51, 52, 65, 68, 69, 70, 71, 72, 74, 75, 76, 77, 78, 82, 84, 85, 87, 88, 90, 92, 93], "numblockspercachelevel": 0, "numcompletedrequest": 0, "numcontextrequest": [0, 1], "numcopystream": [0, 1], "numctxsequ": 1, "numctxtoken": 0, "numdevicemodulelay": 0, "numdrafttoken": 1, "numdrafttokenshost": 1, "numeaglelay": 1, "numel": 82, "numensurework": 0, "numer": [6, 24, 59, 68, 83, 86], "numexpert": 1, "numgeneratedtoken": 0, "numgenrequest": 0, 
"numgensequ": 1, "numgentoken": 0, "numhead": 6, "numhostmodulelay": 0, "numkvattentionhead": 1, "numkvhead": 6, "numlanguag": 1, "numlay": 6, "nummissedblock": 0, "numnewactiverequest": 0, "numnewallocatedblock": 0, "numnewtokenscumsum": 88, "numnod": [0, 88], "numpag": 1, "numpausedrequest": 0, "numpi": [9, 77, 82], "numputwork": 0, "numqueuedrequest": [0, 88], "numreturnbeam": 0, "numreturnsequ": [0, 1, 3], "numreusedblock": 0, "numscheduledrequest": 0, "numsequ": 1, "numslot": 1, "numtoken": 1, "numtotalallocatedblock": 0, "numtransformerslay": 1, "nvcc": 18, "nvcr": 88, "nvfp4": [24, 25, 54, 59, 65, 68, 88, 89], "nvidia": [13, 14, 16, 17, 18, 19, 20, 21, 23, 25, 27, 29, 30, 31, 33, 34, 35, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 60, 62, 66, 68, 69, 70, 75, 77, 83, 84, 86, 87, 88, 89], "nvila": [86, 88], "nvinfer1": [0, 1], "nvl": [1, 25, 88], "nvl36": 71, "nvl72": 71, "nvlink": [2, 6, 70, 71, 73, 88], "nvswitch": [14, 24], "nyou": 40, "o": [0, 1, 7, 9, 17, 22, 24, 50, 51, 52, 67, 87], "o_proj": 15, "oai": [30, 56], "obei": 87, "object": [0, 1, 3, 8, 12, 14, 15, 17, 32, 40, 65, 77, 78, 79, 80, 82, 83, 84, 91], "observ": [46, 69], "obtain": [2, 16, 69, 77], "occas": 87, "occasion": 88, "occup": [5, 84], "occupi": [23, 84], "occur": [6, 8, 93, 94], "odd": 47, "off": [8, 67, 72, 74, 75, 84, 88], "offer": [14, 16, 23, 24, 66, 92], "offic": 40, "officenetsecur": 40, "offici": [5, 18, 68], "offlin": [12, 21, 36, 68, 69, 88], "offload": [0, 11, 25, 59, 65, 88], "offset": [1, 77, 82, 85, 88], "offsetdim": 1, "ofitensor": 0, "often": [0, 3, 10, 19, 23, 24, 65, 71, 72, 77], "ok": 87, "okai": 46, "old": [7, 9, 87], "older": [8, 17, 60, 86], "oldest": 9, "oldvalu": 0, "omit": [1, 3, 17, 77], "ompi": [62, 87], "onboard": [0, 8, 65, 84], "onboard_block": 65, "onboardblock": 0, "onc": [0, 3, 5, 6, 7, 14, 16, 60, 64, 65, 72, 77, 84], "one": [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 13, 14, 15, 17, 19, 24, 25, 26, 27, 53, 64, 65, 68, 70, 71, 
72, 75, 76, 77, 78, 80, 82, 84, 87, 88, 90, 94], "ones": [0, 9], "oneshot": [24, 77], "oneshotallreduc": 24, "oneshotar": 24, "onevis": [86, 88], "ongo": [17, 54], "onli": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17, 18, 23, 25, 26, 32, 47, 54, 59, 64, 65, 68, 69, 70, 71, 72, 74, 75, 76, 77, 78, 80, 82, 84, 86, 88, 91, 94], "onlin": [16, 21, 36], "only_cross_attent": 78, "onnx": [25, 77], "onnx__gathernd": 77, "onto": 6, "oom": [18, 19, 22, 84], "ootb": 88, "op": [1, 7, 77, 88], "op_and": 77, "op_or": 77, "op_xor": 77, "opaqu": 7, "opaque_st": 65, "open": [6, 19, 24, 54, 66, 67, 87, 88], "openai": [26, 58, 83, 88], "openipc": 1, "openmpi": 88, "opensora": 88, "openssh": 27, "oper": [0, 1, 3, 5, 6, 7, 10, 13, 14, 15, 24, 25, 47, 65, 68, 71, 72, 75, 77, 83, 84, 86, 88, 91, 92, 93], "opportun": 68, "opt": [3, 13, 23, 27, 77, 85, 86, 87, 88], "opt_batch_s": [65, 79], "opt_num_token": [25, 65, 79], "optforcausallm": [13, 79], "optim": [1, 2, 3, 6, 7, 10, 14, 16, 17, 19, 20, 21, 22, 23, 25, 41, 47, 49, 60, 64, 66, 68, 69, 71, 72, 73, 77, 83, 84, 86, 87, 88, 89, 91, 92, 93], "optimaladapters": [0, 1], "option": [0, 1, 3, 6, 7, 10, 12, 17, 20, 25, 26, 32, 47, 51, 53, 59, 62, 65, 67, 68, 69, 70, 71, 73, 74, 77, 80, 82, 84, 87, 88, 90, 92, 93], "optionalbufferptr": 1, "optionaltensorptr": 1, "optmodel": 79, "optvec": 1, "orchestr": [0, 2, 10, 87, 88], "orchestratorconfig": 0, "orchleadercomm": 0, "order": [0, 1, 2, 5, 15, 19, 65, 68, 69, 72, 76, 77, 78, 84], "org": [0, 1, 4, 9, 25, 61, 62, 77, 85], "organ": [66, 93], "origin": [5, 7, 9, 77, 88, 90], "original_max_position_embed": [77, 78], "originaltemperatur": 1, "oserror": 88, "osl": [19, 20, 21, 22, 24, 68, 69, 75], "ostream": [0, 1], "other": [0, 1, 2, 3, 4, 5, 6, 8, 10, 14, 15, 17, 19, 24, 25, 32, 44, 46, 50, 51, 52, 54, 60, 64, 65, 66, 69, 70, 71, 72, 74, 75, 76, 77, 80, 84, 87, 88, 92, 94], "other_audio_input": 82, "other_decoder_input": 82, "other_vision_input": 82, "othercach": 1, "otherwis": [0, 1, 3, 5, 
6, 32, 65, 68, 77, 82, 87, 92], "our": [18, 23, 24, 38, 40, 41, 42, 44, 45, 68, 69, 72, 74, 75, 77, 86, 87, 88, 90], "out": [0, 1, 2, 9, 17, 19, 20, 21, 22, 24, 36, 50, 51, 52, 64, 67, 69, 72, 74, 75, 77, 83, 84, 88], "out_bia": 78, "out_channel": 78, "out_context_dim": 78, "out_dim": 78, "out_fatur": 13, "out_featur": [13, 14, 78], "out_hidden_s": 77, "out_of_tree_exampl": 90, "out_point": 77, "out_tp": [19, 22], "outdim": 1, "outdimfirst": 1, "outer": 77, "outlin": 67, "output": [0, 1, 2, 5, 6, 7, 8, 9, 10, 14, 18, 19, 20, 21, 22, 23, 24, 25, 26, 32, 34, 35, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, 53, 54, 61, 62, 65, 67, 69, 70, 71, 72, 73, 75, 76, 77, 78, 82, 83, 87, 88, 89, 91, 92, 94], "output_cum_log_prob": 82, "output_dim": 78, "output_dir": [9, 11, 12, 13, 14, 17, 25, 68, 71, 79, 81, 83, 87], "output_dtyp": [77, 78], "output_generation_logit": 82, "output_id": 82, "output_log_prob": 82, "output_multiplier_scal": 79, "output_pad": [77, 78], "output_s": 78, "output_seqlen": [19, 22], "output_sequence_length": 82, "output_timing_cach": [25, 65], "output_token": 68, "outputbuff": 1, "outputconfig": [0, 3, 32, 88], "outputidscba": 1, "outputlen": 0, "outputlogprob": 1, "outputtokenid": [0, 3], "outsid": [10, 16, 17, 92], "outsiz": 1, "outtpsplitdim": 1, "outweigh": 71, "over": [0, 1, 8, 10, 15, 18, 20, 21, 23, 24, 28, 67, 68, 71, 74, 75, 77, 88], "overal": [3, 5, 8, 10, 18, 66, 71, 72, 74, 75, 76, 90], "overcom": [5, 14, 24], "overflow": 1, "overhead": [3, 14, 24, 71, 88, 92], "overiew": 68, "overlap": [0, 2, 10, 18, 24, 88, 94], "overload": [0, 1], "overrid": [1, 15, 17, 32, 77, 82], "override_field": 79, "overshadow": 71, "oversubscrib": [64, 70], "overview": [3, 18, 23, 59, 60, 67, 68, 70, 89, 91], "overwhelm": 53, "overwrit": [5, 26], "own": [0, 1, 2, 8, 10, 13, 14, 15, 16, 17, 18, 32, 60, 90], "ownership": 0, "ownsev": 1, "ownsstream": 1, "p": [0, 6, 10, 16, 27, 50, 51, 52, 65, 79, 82, 88], "p2p": 77, "p50": [68, 69], "p90": [68, 69, 70], 
"p95": [68, 69, 70], "p99": [68, 69, 70], "p_max": 0, "p_x": 0, "pack": [0, 1, 6, 25, 59, 76, 77, 79, 84, 90], "packag": [3, 60, 61, 62, 68, 70, 87, 88], "packed_length": 79, "packedinput": 1, "packedmask": 1, "packedmaskhost": 1, "packedmaskhostcopi": 1, "packedmasksdevic": 1, "packedpositionid": 1, "pad": [0, 1, 6, 7, 9, 25, 26, 59, 65, 66, 77, 78, 82, 84, 88], "pad_id": [65, 82], "pad_lda": 78, "pad_ldc": 78, "pad_token_id": 82, "padding_2d": 77, "padding_back": 77, "padding_bottom": 77, "padding_front": 77, "padding_left": 77, "padding_mod": 78, "padding_right": 77, "padding_top": 77, "padid": 0, "page": [1, 2, 6, 8, 14, 21, 25, 59, 64, 68, 70, 72, 77, 83, 84, 88, 92], "paged_context_fmha": [72, 88], "paged_kv_cach": [9, 25, 68, 82], "paged_st": [25, 82], "pagedcontextfmha": 1, "pagedkvcach": 6, "pagedst": 1, "pageid": 1, "pageidx": 1, "pagemanagerconfig": 1, "pageptr": 1, "pagewidth": 1, "pair": [0, 1, 19, 72, 75, 77], "pale": 46, "paper": [2, 9, 10, 20, 85, 92], "par": [74, 75], "parallel": [0, 2, 3, 5, 6, 10, 13, 14, 18, 19, 21, 22, 26, 36, 37, 45, 47, 59, 65, 69, 72, 73, 77, 78, 79, 84, 88, 90, 94], "parallel_attent": [13, 79], "parallelconfig": [0, 88], "param": [0, 1, 15, 41, 42, 43, 45, 46, 54, 65, 77, 78, 79, 82], "paramet": [0, 1, 3, 4, 5, 8, 9, 10, 11, 13, 14, 15, 17, 18, 25, 26, 50, 65, 68, 71, 72, 73, 76, 77, 78, 79, 82, 84, 88, 92], "parametr": 82, "parent": [0, 1, 15, 17], "parent_hash": 46, "parenthash": 0, "parentid": 1, "pari": [38, 41, 42, 43, 44, 45, 54], "pars": [1, 65], "parse_arg": 49, "parser": [26, 49, 58], "part": [1, 3, 4, 7, 14, 15, 17, 59, 60, 64, 65, 66, 69, 74, 75, 76, 77, 82, 84], "part2": 88, "parti": 88, "partial": [0, 4, 8, 14, 24, 65, 71], "particip": [0, 54, 77, 88], "participantid": [0, 2], "particular": [0, 3, 64, 73, 74, 75, 83], "particularli": [24, 60, 75, 93], "partit": [5, 9, 14, 50, 51, 52], "pass": [0, 1, 3, 5, 7, 8, 9, 10, 14, 15, 32, 47, 53, 54, 65, 67, 68, 70, 72, 74, 75, 77, 78, 79, 82, 84, 88, 89, 90, 91, 92, 
94], "past": [0, 5], "past_key_valu": [77, 78], "past_key_value_length": 78, "past_key_values_length": 78, "past_kv_length": 82, "past_sequence_length": 82, "patch": [78, 82], "patch_siz": [78, 79], "path": [0, 1, 3, 5, 10, 13, 15, 18, 25, 26, 32, 41, 42, 43, 44, 45, 49, 50, 51, 52, 54, 60, 64, 65, 67, 68, 69, 70, 72, 77, 82, 88], "path_to_llama_from_hf": 91, "path_to_meta_llama_from_hf": 64, "path_to_trt_engin": 64, "pathlib": [49, 65], "pathlik": 79, "pathorn": 88, "pathsoffset": 1, "pattern": [4, 24, 59, 65, 77, 88], "patternanalyz": 7, "patternrewrit": 7, "paus": [0, 76, 94], "paused_request": 94, "pcie": 25, "pdf": [0, 4, 9], "pdl": [24, 88], "peak": [0, 18, 19, 20, 24, 69], "peft": 65, "peft_cache_config": [32, 44, 65], "peftcacheconfig": [0, 65], "peftcachemanag": [0, 88], "penal": [0, 6, 65], "penalti": 88, "penalty_alpha": 6, "pend": 94, "pending_request": 94, "per": [0, 1, 3, 5, 6, 10, 14, 17, 18, 19, 21, 22, 24, 25, 26, 50, 51, 52, 65, 68, 69, 70, 71, 72, 77, 78, 84, 85, 88], "per_channel": 85, "per_group": 85, "per_token": 85, "per_token_scal": 77, "perceiv": 20, "percent": [0, 11], "percentag": [9, 11, 68, 69, 70], "percentil": [68, 88], "perf": [0, 18, 26, 58, 65, 77, 88], "perf_best_practic": 88, "perform": [0, 1, 2, 3, 5, 6, 7, 9, 14, 15, 16, 17, 19, 21, 22, 25, 26, 32, 60, 64, 65, 66, 68, 69, 71, 74, 76, 77, 82, 83, 86, 88, 90, 92, 93], "performantli": 19, "permut": 77, "persimmon": 88, "persist": [23, 64], "person": [27, 53], "phase": [0, 2, 7, 10, 19, 22, 24, 25, 59, 68, 73, 74, 75, 76, 77, 84, 88, 92, 93], "phi": [64, 77, 85, 86, 88], "phi3config": 79, "phi3forcausallm": 79, "phi3model": 79, "phiconfig": 79, "phiforcausallm": 79, "phimodel": 79, "physic": [77, 84], "picasso": 54, "pick": 74, "pickl": 88, "piec": 74, "pin": [0, 1, 8], "ping": 88, "pinnedmemusag": 0, "pinnedpool": 1, "pip": [18, 26, 60, 61, 62, 83, 88], "pip3": [61, 62], "pipelin": [0, 1, 3, 6, 14, 19, 22, 25, 26, 45, 65, 68, 69, 73, 84, 88, 94], "pipeline_parallel_s": [45, 65, 
71, 72], "pipelineparallel": [0, 1, 6], "pipelineparallelismrank": 1, "pitfal": [8, 17], "pixart": 78, "pixartalphatextproject": 78, "pixel_valu": 79, "pl": [62, 68], "place": [1, 25, 46, 62, 77, 88, 90], "placement": 24, "plai": 74, "plan": [3, 5, 24, 60], "planner": 88, "platform": [27, 28, 38, 41, 42, 44, 45, 60, 66, 68, 88, 89], "pleas": [2, 5, 7, 10, 12, 19, 21, 22, 23, 24, 28, 32, 40, 60, 62, 68, 69, 71, 73, 77, 87, 88, 89, 94], "plu": 82, "plugin": [5, 6, 7, 11, 13, 59, 60, 65, 74, 77, 79, 83, 84, 85, 87, 88], "plugin_config": [65, 72, 75, 77, 79], "plugin_namespac": 7, "plugin_typ": 7, "plugin_v2": 7, "plugin_v2_gemm_0": 87, "pluginconfig": [65, 80], "pluginconfigmeta": 80, "pluginfield": 88, "pluginv2build": 87, "pm": [18, 24, 68], "pmi": 87, "pmi2_init": 87, "pmix": [14, 26, 50, 51, 52, 87], "png": [30, 35, 56], "po": 78, "point": [1, 5, 14, 16, 20, 23, 36, 40, 45, 61, 62, 64, 65, 69, 71, 76, 77, 83, 85, 87, 88], "pointer": [0, 1, 6, 15, 77, 82, 88], "pointerelementtyp": 1, "polar": 86, "polici": [0, 1, 2, 65, 68, 70, 84], "poll": [0, 26], "polyhedr": 14, "pong": 88, "pool": [0, 1, 5, 59, 77, 82, 93, 94], "pooled_project": [78, 79], "pooled_projection_dim": 78, "pooledpin": 0, "poor": 2, "popd": 87, "popfirstgentoken": 0, "popul": [1, 5, 14, 54, 77], "popular": [5, 13, 17, 23, 28, 64], "port": [0, 26, 28, 33], "portfolio": 21, "portion": [4, 71, 77, 84], "pos_emb_typ": 77, "pos_embd_param": 92, "pos_embed_max_s": 78, "pos_embed_typ": 78, "pose": 75, "posit": [0, 1, 10, 24, 65, 68, 77, 78, 82, 88, 92], "position_embed": [77, 78], "position_embedding_typ": [5, 13, 77, 78, 79], "position_encoding_2d": 79, "position_id": [79, 82, 87, 90, 92], "positionalembeddingparam": 92, "positionembeddingtyp": [5, 77, 78, 79], "positionid": [0, 1], "positionidsbas": 1, "positionidsdevic": 1, "positionidshost": 1, "positionidshostcopi": 1, "positionoffset": 1, "positionoffsetsdevic": 1, "positionoffsetshost": 1, "positionoffsetshostcopi": 1, "possibl": [2, 3, 5, 6, 8, 10, 
14, 18, 25, 32, 60, 66, 67, 68, 69, 72, 74, 76, 77, 84, 87, 88, 91], "possibli": [1, 77], "post": [0, 13, 20, 23, 24, 54, 66, 67, 77, 83, 88], "post_act_fn": 78, "post_attention_layernorm": [15, 90], "post_input_id": 82, "post_layernorm": [12, 13, 15, 77, 87], "post_pad": 77, "post_prompt": 82, "post_strid": 77, "posterior_threshold": [39, 65], "posterioralpha": 1, "posterioralphahost": 1, "posteriorthreshold": [0, 1], "posteriorthresholdhost": 1, "postprocess": [26, 78], "postprocessor": [0, 65], "postprocparam": 65, "potenti": [0, 1, 10, 25, 67, 68, 72, 90], "pow": 77, "power": [8, 14, 21, 23, 24, 66, 74, 88], "pp": [0, 2, 6, 9, 19, 22, 26, 68, 70, 77, 88], "pp2": 68, "pp_communicate_final_output_id": 82, "pp_communicate_new_token": 82, "pp_reduce_scatt": [25, 75], "pp_size": [13, 14, 26, 33, 68, 69, 71, 81, 88], "ppreducescatt": 1, "pr": 24, "practic": [5, 14, 20, 21, 24, 83, 84, 88], "pre": [0, 1, 3, 5, 13, 16, 60, 62, 65, 66, 68, 77, 83, 84, 88, 92], "pre_input_id": 82, "pre_layernorm": 77, "pre_onli": 78, "pre_pad": 77, "pre_prompt": 82, "pre_quant_scal": [13, 65], "pre_strid": 77, "prebuilt": 60, "preced": [14, 77], "precis": [1, 6, 15, 19, 23, 25, 59, 68, 72, 75, 80, 83, 84, 86, 88], "precompute_relative_attention_bia": 79, "precomputed_relative_attent": 78, "predefin": [10, 90, 92], "predict": [1, 5, 10, 24, 88], "predicteddraftlogit": 1, "predictor": 10, "predictsdrafttoken": 1, "prefer": [23, 60], "prefer_managed_weight": 78, "prefer_plugin": 77, "prefetch": 24, "prefil": [0, 65, 73], "prefix": [3, 10, 13, 46, 64, 70, 77, 80, 87], "preliminari": [19, 21, 22], "preload": 15, "prepar": [0, 1, 2, 24, 46, 51, 59, 67, 74, 77, 79, 85, 88, 92], "prepare_dataset": [18, 51, 67, 68, 69, 70], "prepare_input": [79, 84], "prepare_position_ids_for_cogvlm": 82, "prepare_recurrent_input": 79, "prepare_resourc": [91, 93], "prepareforward": 1, "prepend": 87, "preprocess": [15, 82, 85], "preprocess_weights_hook": 79, "preprocessor": 68, "prequant_scaling_factor": 13, 
"prerequisit": [59, 61, 62], "presenc": [6, 14, 46], "presence_penalti": [65, 82, 88], "presencepenalti": [0, 1, 6], "present": [0, 65, 68, 74, 75, 85, 88], "preserv": 72, "presid": [36, 38, 39, 41, 42, 43, 44, 45, 47, 49, 54, 61, 62, 70, 76, 83, 89], "pretrain": 16, "pretrained_config": 90, "pretrained_model_name_or_path": 79, "pretrainedconfig": [12, 17, 65, 79, 80, 90], "pretrainedmodel": [17, 79, 84], "pretrainedtokenizerbas": 65, "prevdrafttokenslen": 1, "prevent": [24, 59, 64], "preview": 88, "previou": [1, 3, 4, 10, 17, 18, 20, 68, 70, 71, 72, 74, 75, 76, 88], "previous": [1, 19, 72, 74, 76, 88], "prevscor": 1, "prewritten": 83, "price": 68, "primari": [0, 1, 23, 84, 94], "primarili": 92, "primit": [14, 66, 83], "print": [1, 5, 26, 32, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 53, 54, 55, 56, 57, 61, 62, 68, 69, 70, 76, 83, 84, 87, 89], "print_iter_log": [18, 51], "prior": [3, 25, 60, 62], "priorit": [23, 74, 76], "prioriti": [0, 1, 8, 15, 65], "prioritytyp": 0, "priorityupd": 0, "privat": [0, 1, 6], "privileg": 7, "prm": 86, "pro": 24, "prob": 77, "probabilist": 78, "probabl": [0, 1, 6, 8, 10, 24, 65, 77, 82, 88], "probil": 1, "problem": [5, 18, 87], "proc": 15, "proccessed_weight": 15, "proccessed_zero": 15, "procedur": 18, "proceed": 14, "process": [0, 1, 2, 3, 5, 6, 10, 13, 14, 17, 18, 24, 25, 36, 40, 45, 47, 50, 51, 52, 61, 62, 64, 65, 66, 67, 68, 69, 70, 71, 74, 75, 76, 77, 82, 83, 87, 88, 90, 91, 92, 94], "process_input": 82, "process_logits_including_draft": 82, "processor": [0, 5, 36, 37, 48, 65, 79, 82, 88], "processorbatch": 0, "processormap": 0, "prod": 77, "produc": [0, 1, 3, 7, 14, 32, 68, 70, 72, 74, 75, 77, 88], "product": [4, 5, 10, 14, 21, 66, 74, 75, 76, 77, 83, 92], "profil": [2, 25, 26, 34, 35, 59, 72, 74, 77, 82, 84, 87, 88], "profiling_verbos": [25, 65], "profit": [10, 68], "program": [2, 17, 36, 38, 41, 42, 44, 45, 47, 61, 62, 64, 76, 83, 87], "progress": [1, 24, 65, 68, 77], "proj": [13, 15, 87], "project": [5, 9, 54, 60, 
77, 78, 90, 93], "projector_hidden_act": 79, "prologu": [50, 51, 52], "promin": 10, "promis": [10, 17], "prompt": [0, 3, 6, 8, 12, 18, 25, 26, 31, 32, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 53, 54, 57, 59, 61, 62, 65, 68, 70, 74, 75, 76, 78, 82, 83, 88, 89, 92], "prompt_adapter_request": [65, 88], "prompt_embedding_t": [78, 79, 82], "prompt_embedding_table_s": 79, "prompt_id": 47, "prompt_len": 92, "prompt_logprob": 65, "prompt_lookup": [10, 88], "prompt_lookup_num_token": 6, "prompt_tabl": 82, "prompt_task": [79, 82], "prompt_token": 83, "prompt_token_id": [32, 48, 65], "prompt_vocab_s": [79, 82], "promptadapterrequest": 65, "promptinput": [65, 88], "promptlen": 0, "prompttableoffload": 0, "prompttuningconfig": 0, "prompttuningembed": 78, "prompttuningen": 1, "pronounc": 10, "proof": 93, "propag": [8, 88], "proper": [2, 68], "properli": [15, 74, 76], "properti": [3, 40, 65, 77, 79, 80, 82], "proport": 5, "propos": 24, "protect": [1, 36, 45, 61, 62, 64, 83], "protocol": [0, 26, 40], "proud": 24, "prove": 10, "provid": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 16, 17, 18, 19, 20, 23, 24, 25, 26, 27, 32, 40, 49, 54, 60, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 77, 82, 84, 86, 87, 88, 90, 91, 92], "proxy_dispatch_result_thread": 68, "prune": [7, 10, 77], "pseudo": [5, 77, 85], "pth": [15, 88], "ptq": [23, 72, 88], "ptr": 1, "ptr_idx": 15, "ptrdiff_t": 1, "ptuning_setup": 82, "ptuning_setup_fuyu": 82, "ptuning_setup_llava_next": 82, "ptuning_setup_phi3": 82, "ptuning_setup_pixtr": 82, "ptuningconfig": 0, "public": [0, 1, 23, 28, 49, 54], "publish": [18, 19, 22, 68, 69, 88], "pull": [16, 18, 60, 83, 88], "puneeshkhanna": 88, "purchas": 68, "pure": 82, "purpos": [5, 60, 70, 72, 74, 75], "pursu": [38, 41, 42, 44, 45, 47], "push": [27, 48], "pushd": 87, "put": [1, 13, 24, 50, 51, 52, 64, 66, 74], "pwd": [18, 60], "py": [3, 4, 5, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 24, 47, 50, 51, 60, 62, 64, 67, 68, 69, 70, 71, 72, 77, 80, 82, 83, 87, 88, 90, 91, 93, 
94], "py3": 88, "py_executor_cr": 94, "pybind": 88, "pybind11_object": 65, "pybindmirror": 65, "pydant": [65, 88], "pydantic_cor": 65, "pyexecutor": [46, 88, 93, 94], "pynvml": 88, "pypi": [60, 88], "python": [1, 5, 6, 7, 9, 10, 12, 14, 16, 17, 18, 26, 32, 42, 43, 59, 61, 62, 64, 67, 68, 69, 70, 71, 83, 85, 88, 90, 91, 93, 94], "python3": [9, 11, 13, 18, 50, 51, 60, 62, 67, 68, 83, 87], "python_bind": 18, "python_e2": 82, "python_plugin": 88, "pythonpath": [18, 51, 52], "pytorch": [7, 10, 13, 16, 18, 26, 33, 46, 50, 51, 52, 59, 60, 61, 62, 65, 69, 77, 88, 91, 92, 93, 94], "pytorch_backend_config": [18, 26, 46, 51, 68, 69, 92], "pytorch_config": [46, 92], "pytorch_eagle_weights_path": 65, "pytorch_extra_arg": 51, "pytorch_model": 87, "pytorch_model_engin": 91, "pytorch_model_registri": 93, "pytorchconfig": [46, 92], "pytorchmodelengin": [91, 93], "pzzzzz5142": 88, "q": [2, 5, 6, 9, 19, 24, 59, 68, 77, 87, 90, 92], "q_b_proj": 77, "q_dim": 77, "q_lora_rank": [77, 78], "q_proj": [15, 90], "q_scale": [5, 77, 78, 79], "qa": 10, "qformat": [68, 81], "qgmma": 88, "qingquansong": 88, "qk_layernorm": [78, 79], "qk_nope_head_dim": [77, 78], "qk_norm": 78, "qk_rope_head_dim": [77, 78], "qkv": [7, 9, 13, 15, 59, 77, 87, 88, 92], "qkv_bia": [77, 88], "qkv_dim": 77, "qkv_proj": 90, "qo_indptr": 92, "qserv": 88, "quadrat": [5, 84], "qualiti": [72, 75], "qualnam": [65, 77, 79, 81], "quant": [17, 65, 68, 77, 88, 89], "quant_algo": [13, 15, 17, 32, 54, 65, 68, 72, 79], "quant_and_calib_config": 54, "quant_config": [17, 32, 54, 65, 72, 79, 92], "quant_medusa_head": 81, "quant_mod": [17, 65, 78, 79, 82], "quantalgo": [32, 54, 65, 72, 79, 81], "quantconfig": [17, 32, 54, 65, 72, 79, 88, 92], "quanticonfig": 17, "quantiz": [5, 6, 14, 15, 18, 19, 20, 24, 25, 36, 37, 41, 49, 59, 62, 63, 64, 65, 66, 69, 70, 73, 77, 78, 79, 82, 83, 86, 88, 90, 92], "quantizaton": 68, "quantize_and_export": 81, "quantize_kwarg": 79, "quantize_lm_head": [81, 88], "quantized_valu": 5, "quantizedkernel": 14, 
"quantizetensorplugin": 14, "quantmod": [1, 5, 6, 59, 65, 77, 78, 79, 81, 82], "quantmodewrapp": [65, 77], "queri": [3, 6, 10, 14, 19, 26, 59, 68, 77, 84, 92, 93], "query_dim": 78, "query_key_valu": 15, "query_length": 78, "query_pre_attn_scalar": 79, "question": [53, 68, 84, 87], "queu": [0, 69, 74], "queue": [0, 65, 66, 91], "quick": [5, 59, 66, 68, 70, 92], "quick_gelu": 77, "quicker": 71, "quickli": [17, 83], "quickstart": [64, 70], "quickstart_advanc": 50, "quit": [7, 64], "qweight": 15, "qwen": [15, 26, 35, 64, 68, 77, 85, 86, 88], "qwen1": [86, 88], "qwen2": [9, 26, 30, 35, 56, 68, 86, 88], "qwen2_5_vlforconditionalgener": 86, "qwen2audio": 88, "qwen2forcausallm": 86, "qwen2forprocessrewardmodel": 86, "qwen2forrewardmodel": 86, "qwen2forsequenceclassif": 88, "qwen2vl": 88, "qwen2vlforconditionalgener": 86, "qwenforcausallm": 15, "qwenforcausallmgenerationsess": 82, "qwenvl": 88, "qwq": 86, "qychen": 9, "qzero": 15, "r": [1, 9, 26, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 53, 54, 61, 62, 70, 76, 77, 83, 87, 88, 89], "r1": [26, 58, 69, 88], "r1_in_tensorrt": [24, 88], "race": 88, "radix": 93, "rai": 1, "rais": [17, 65, 70, 87, 88], "rand": 77, "rand_data": 77, "rand_data_sampl": 79, "rand_data_valid": 79, "random": [0, 6, 26, 34, 35, 65, 69, 77, 88], "random_se": [65, 79, 82], "randomdatasampl": 1, "randomdatavalid": 1, "randomli": 69, "randomse": [1, 6, 88], "randomseedtyp": 0, "rang": [0, 6, 8, 10, 67, 75, 77, 79, 84, 85, 86, 87, 90], "rank": [0, 1, 2, 3, 4, 6, 9, 17, 18, 25, 64, 68, 77, 79, 82, 84, 87, 88], "rank0": 13, "rank1": 13, "rapid": [10, 69, 83], "rate": [0, 18, 24, 26, 34, 35, 68, 69, 70, 88], "rather": [5, 7, 10, 62, 66], "raw": 26, "raw_audio": 82, "raw_imag": 82, "rdma": 2, "re": [18, 23, 65, 66, 88, 92], "reach": [0, 5, 13, 64, 68, 72, 76], "read": [2, 3, 5, 10, 12, 14, 15, 18, 24, 25, 53, 68, 88], "read_config_from_the_custom_training_checkpoint": 17, "readabl": 68, "reader": 77, "readi": [0, 83], "readm": [2, 10, 26, 64, 70, 88], 
"real": [7, 18, 24, 60, 70, 72, 74, 75, 77, 87], "realiti": 74, "realiz": [8, 10], "rearrang": 77, "reason": [0, 5, 6, 14, 17, 24, 26, 58, 65, 68, 71, 74, 75, 77, 87], "reasoning_pars": [26, 33], "rebuild": [75, 77, 87], "receiv": [0, 1, 2, 3, 4, 10, 72, 77, 88], "recent": [1, 4, 5, 20, 24], "recip": [24, 26, 65, 85], "reclaim": 0, "recogn": [10, 24, 68, 90], "recommend": [2, 5, 6, 10, 12, 15, 16, 18, 20, 23, 26, 47, 60, 65, 68, 73, 74, 76, 87, 88, 90, 92], "recompute_scale_factor": 77, "reconfigur": [3, 62], "reconstruct": [5, 77], "record": [1, 7, 18, 24, 65], "recored": 0, "recreat": 16, "recurr": 10, "recurrentgemma": [85, 86, 88], "recurrentgemmaforcausallm": 79, "recurs": [18, 60, 64], "recv": [0, 14, 77], "recvconnect": 0, "recvpollperiodm": 0, "recycl": [5, 93], "redesign": 88, "redirect": [7, 65], "redraft": [59, 77, 82, 88], "redrafter_draft_len_per_beam": 82, "redrafter_inverted_temperatur": 79, "redrafter_num_beam": 82, "redrafterforcausallm": 79, "reduc": [2, 3, 4, 5, 8, 10, 14, 18, 19, 22, 24, 25, 60, 64, 66, 67, 68, 69, 70, 71, 74, 76, 77, 84, 87, 88, 92], "reduce_fus": [25, 68, 72, 75], "reduce_scatt": 77, "reduceoper": 77, "reducescatt": [25, 75, 88], "reduct": [10, 24, 76, 77], "redund": [10, 24], "refactor": [17, 88], "refer": [0, 1, 2, 3, 5, 6, 7, 9, 10, 14, 16, 17, 18, 26, 28, 29, 30, 31, 32, 33, 34, 35, 36, 47, 55, 56, 57, 60, 64, 66, 68, 69, 70, 71, 72, 73, 75, 77, 83, 86, 88, 90, 92], "referenc": 72, "reference_wrapp": [0, 3], "refin": 88, "refit": [14, 25, 88], "refit_engin": 14, "reflect": 74, "refresh": 68, "regard": 77, "regardless": 87, "regex": [3, 65], "region": 67, "regist": [27, 59, 87, 88, 90], "register_auto_model": 90, "register_network_output": 87, "regress": [5, 6, 14], "regular": [0, 3, 5, 24, 65, 77], "reinforc": 73, "reject": 0, "rel": [8, 19, 74, 76, 77, 88], "rel_attn_t": 78, "relat": [2, 4, 15, 59, 66, 67, 77, 80, 84, 87, 88, 89, 90, 93], "relationship": 84, "relative_attent": [77, 78], "relative_attention_bia": 77, 
"relax": 5, "relaxed_delta": [24, 65], "relaxed_topk": [24, 65], "releas": [1, 5, 6, 17, 19, 22, 23, 59, 66, 77, 79, 84, 85, 86], "release_build": 60, "release_run": [60, 83], "releasepag": 1, "releasest": 0, "relev": [6, 60, 93], "reli": [2, 5, 7, 17, 64, 67, 85], "reload": 3, "relu": [13, 14, 77, 87], "remain": [0, 7, 8, 10, 11, 24, 60, 69, 70, 72, 74, 75, 77, 84, 88], "remaind": 72, "remark": 24, "remind": [5, 92], "remot": 65, "remov": [0, 1, 5, 6, 7, 14, 15, 18, 25, 26, 49, 60, 65, 66, 72, 77, 84, 88, 90], "remove_const_t": 1, "remove_cv_t": 0, "remove_duplicated_kv_head": 79, "remove_input_pad": [5, 9, 25, 77, 78, 82], "remove_pointer_t": 1, "remove_reference_t": 1, "remove_sequ": 93, "renam": 88, "reorder": [77, 78], "reorder_kv_cache_for_beam_search": 82, "rep": 67, "repeat": [0, 5, 65, 77], "repeat_interleav": 77, "repeatedli": 10, "repetit": [0, 6, 65, 77], "repetition_penalti": [6, 65, 82, 88], "repetitionpenalti": [0, 1, 6], "replac": [1, 4, 7, 14, 15, 17, 18, 68, 70, 72, 76, 77, 84, 90], "replace_add_with_sub": 7, "replace_all_uses_with": [7, 77], "replace_input_with": 7, "replace_output_uses_with": 7, "replace_outputs_uses_with": 7, "replic": [0, 3, 24, 77], "replit": [85, 86, 88], "repo": [17, 64, 66, 70, 87], "repo_id": 53, "report": [67, 68, 69, 84, 88], "reportpluginerror": 87, "repositori": [10, 16, 18, 27, 64, 83], "repres": [0, 1, 2, 10, 18, 19, 23, 24, 40, 53, 65, 68, 74, 77, 82, 94], "represent": [7, 14], "reproduc": [59, 68, 88], "req": [18, 68, 69, 70, 72, 74, 75], "req_id": 47, "req_logit": 47, "req_stat": 94, "req_token_id": 47, "reqbeamwidth": 1, "reqid": 0, "reqpromptlength": 1, "request": [0, 2, 5, 6, 8, 9, 14, 18, 20, 22, 25, 26, 34, 35, 47, 51, 65, 66, 67, 68, 69, 70, 72, 74, 75, 76, 77, 83, 84, 88, 91, 92, 93, 94], "request_id": [32, 48, 65, 92], "request_stats_max_iter": 65, "request_timeout": 26, "request_typ": 65, "request_type_context_and_gener": [0, 2], "request_type_context_onli": [0, 2], "request_type_generation_onli": [0, 
2], "requesterror": 65, "requestid": [0, 2, 3], "requestidtyp": 0, "requestlist": 94, "requestoutput": [32, 48, 65, 88], "requestperfmetr": 0, "requestschedul": 94, "requeststag": 0, "requeststat": 0, "requeststatsmaxiter": 0, "requeststatsperit": 0, "requeststatsperiter": 0, "requeststatsvec": 0, "requesttoken": 3, "requesttyp": [0, 1, 2, 65], "requesttypesdevic": 1, "requestvector": 1, "requir": [0, 2, 5, 6, 8, 9, 10, 14, 15, 17, 18, 19, 23, 24, 25, 26, 40, 53, 60, 61, 62, 65, 68, 69, 70, 71, 72, 75, 77, 78, 83, 84, 86, 87, 88, 93], "require_ln_f": 79, "requiresattentionmask": 1, "rerun": 75, "rescale_output_factor": 78, "research": [5, 28, 38, 41, 42, 44, 45, 85], "resembl": 46, "reserv": [0, 1, 26, 65, 76, 82, 84, 94], "reserved_block": 94, "reset": [0, 1, 6, 65, 68, 82], "resetspeculativedecodingmodul": 1, "reshap": [1, 77], "resid": [9, 54], "residu": [77, 87], "residual_connect": 78, "residual_mlp": 79, "residual_multipli": 79, "residual_rms_norm": 77, "residual_rms_norm_out_quant_fp8": 77, "residual_rms_norm_out_quant_nvfp4": 77, "residual_rms_norm_quant_fp8": 77, "residual_rms_norm_quant_nvfp4": 77, "residual_rms_prepost_norm": 77, "residualadd": [25, 75, 88], "resiz": 1, "resolv": [30, 56, 87], "resourc": [0, 2, 5, 17, 24, 91, 93, 94], "respect": [4, 32, 76, 77, 82, 84, 85, 90, 94], "respons": [0, 2, 26, 32, 55, 56, 57, 65, 68, 77, 91], "responsewithid": 0, "rest": [1, 5, 72], "restart": 0, "restrict": [0, 2, 3, 6, 60, 65, 77], "result": [0, 1, 4, 5, 10, 14, 19, 20, 21, 23, 25, 32, 59, 60, 65, 68, 71, 72, 73, 74, 75, 77, 78, 88, 90, 92, 94], "retail": 68, "retain": [19, 21], "retent": [0, 65], "retentionprior": 0, "retentionpriorityanddur": 0, "rethink": 10, "retriev": [1, 15, 65, 69, 77], "return": [0, 1, 3, 7, 9, 10, 12, 14, 15, 17, 32, 65, 68, 74, 77, 78, 79, 82, 84, 87, 88, 93, 94], "return_all_generated_token": 82, "return_context_logit": 65, "return_dict": 82, "return_encoder_output": [65, 82], "return_generation_logit": 65, "return_perf_metr": 65, 
"returnallgeneratedtoken": [0, 3], "returncontextlogit": 0, "returnencoderoutput": 0, "returngenerationlogit": 0, "returnlogprob": 0, "returnperfmetr": 0, "reus": [0, 2, 3, 25, 59, 63, 65, 77, 82, 84, 88, 90, 93], "reusabl": 8, "reusedblock": 0, "reusedblocksperrequest": 0, "reveal": 24, "revers": 77, "revert": 77, "review": 68, "revis": 65, "revolution": 66, "rewind": 88, "rewrit": [59, 77, 88, 90], "rewritepatternmanag": 7, "rewrt": 87, "rf": 87, "rg_lru": 77, "rgc": 68, "rh": [0, 1], "rich": 13, "right": [66, 72, 77, 87], "rigor": [46, 68], "risk": [2, 14, 72, 76], "rm": [60, 77, 86, 87, 90], "rms_norm": [24, 77, 90], "rmsnorm": [9, 24, 77, 78, 79, 88, 90], "rnn": [25, 88], "rnn_conv_dim_s": 82, "rnn_head_siz": 82, "rnn_hidden_s": 82, "rnn_state": 79, "rnnconfig": 1, "rnnconvdims": 1, "rnnheadsiz": 1, "rnnhiddens": 1, "ro": 18, "roberta": [86, 88], "robertaforquestionansw": 79, "robertaforsequenceclassif": 79, "robertamodel": 79, "robin": 2, "robust": [24, 88], "rock": 77, "role": [14, 26, 29, 30, 40, 55, 56, 74, 83], "roll": 59, "root": [13, 18, 27, 60, 62, 64, 65, 70, 77, 83], "root_lay": 7, "rope": [24, 77, 82, 88, 92], "rope_gpt_neox": [5, 77, 79], "rope_gptj": [5, 77], "rope_local_base_freq": 79, "rope_scaling_config": 77, "rope_scaling_long_factor": 78, "rope_scaling_long_mscal": 78, "rope_scaling_short_factor": 78, "rope_scaling_short_mscal": 78, "ropeembeddingutil": 77, "rotari": [0, 24, 77, 82, 90, 92], "rotary_bas": 79, "rotary_cos_sin": 77, "rotary_dim": 79, "rotary_embed": 90, "rotary_embedding_bas": [77, 78], "rotary_embedding_base_loc": 78, "rotary_embedding_beta_fast": 78, "rotary_embedding_beta_slow": 78, "rotary_embedding_dim": [5, 77, 79], "rotary_embedding_long_m_scal": 77, "rotary_embedding_max_posit": 77, "rotary_embedding_mscal": 78, "rotary_embedding_mscale_all_dim": 78, "rotary_embedding_origin_max_posit": 78, "rotary_embedding_original_max_posit": 77, "rotary_embedding_percentag": 78, "rotary_embedding_sc": 78, "rotary_embedding_scal": 
77, "rotary_embedding_scale_typ": 77, "rotary_embedding_short_m_scal": 77, "rotary_inv_freq": [77, 78], "rotary_inv_freq_loc": 78, "rotary_pct": 79, "rotary_sc": [78, 79], "rotaryembed": 90, "rotaryembeddingdim": [0, 1], "rotaryscalingtyp": 77, "rotate_every_two": 77, "rotate_half": 77, "round": [2, 77], "rout": 2, "router": [4, 9, 88], "router_gemm": 24, "routin": 7, "routingkernel": 24, "row": [9, 74, 77, 85, 88], "rowlinear": [9, 78], "rowwis": 65, "rr": 88, "rslora": 88, "rst": 3, "rtx": 88, "rubric": 77, "rule": [5, 71, 87], "run": [0, 1, 2, 3, 5, 6, 8, 10, 12, 13, 14, 19, 23, 24, 25, 26, 27, 28, 42, 43, 47, 50, 51, 52, 59, 60, 61, 62, 64, 65, 66, 71, 72, 74, 75, 76, 77, 79, 82, 84, 85, 87, 88, 90, 91, 92, 93], "run_dtm_pld": 10, "run_medusa_decod": 49, "runner": [0, 13, 82], "runningleon": 88, "runpod": 27, "runtim": [0, 3, 5, 10, 11, 16, 24, 25, 26, 44, 47, 53, 59, 65, 66, 67, 68, 70, 73, 74, 77, 78, 79, 83, 87, 88, 90, 92, 94], "runtime_config": [32, 44], "runtime_default": 79, "runtime_error": 1, "runtime_rank": 82, "runtimedefault": [0, 79], "runtimedefaultsin": 79, "runtimeerror": [64, 65, 87], "runtimetensor": 82, "s0": 5, "s1": 5, "s2": 5, "sacrif": 24, "sad": 82, "saeyoonoh": 88, "safe": [1, 7, 75], "safer": 77, "safetensor": [13, 15, 87, 88], "sage_attn": 77, "sage_attn_k_block_s": 77, "sage_attn_k_quant_s": 77, "sage_attn_q_block_s": 77, "sage_attn_q_quant_s": 77, "sage_attn_v_block_s": 77, "sage_attn_v_quant_s": 77, "sageattent": 77, "sai": [67, 70, 74], "said": 72, "sake": 74, "sale": 68, "same": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 14, 17, 20, 25, 47, 50, 51, 52, 60, 64, 68, 69, 72, 75, 76, 77, 78, 80, 82, 84, 88], "sampl": [0, 1, 3, 5, 14, 16, 18, 24, 39, 41, 42, 43, 44, 45, 46, 47, 49, 53, 54, 59, 63, 65, 67, 68, 69, 77, 78, 82, 88], "sample_proj_bia": 78, "sample_weight_strip": 88, "samplemod": 77, "sampling_config": 82, "sampling_param": [32, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 54, 61, 62, 65, 70, 76, 83, 88, 89], 
"samplingconfig": [0, 3, 6, 32, 82, 88], "samplingparam": [32, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 54, 61, 62, 65, 70, 76, 83, 88, 89], "saniti": [61, 62, 71, 72, 75], "santacod": [64, 85, 86], "satfinit": 85, "satisfi": [6, 15, 88], "save": [5, 8, 10, 17, 18, 25, 27, 41, 44, 64, 65, 67, 68, 72, 75, 76, 84, 88], "save_checkpoint": [17, 79], "save_config": [17, 79], "saw": [72, 83], "sbatch": [14, 50, 51, 52], "sbsa": [88, 89], "scaffold": [88, 90], "scalar": [6, 77], "scalartyp": 88, "scale": [0, 6, 9, 15, 25, 65, 72, 77, 78, 85, 88], "scale_d0": 77, "scale_d1": 77, "scale_factor": 77, "scale_output": 77, "scale_qk": 78, "scale_typ": 77, "scalia": [38, 41, 42, 44, 45], "scaling_factor": 77, "scaling_long_factor": 77, "scaling_short_factor": 77, "scalingvecpoint": 1, "scanreducetempstorag": 1, "scanreducetempstoragebyt": 1, "scantempstorag": 1, "scantempstoragebyt": 1, "scatter": [7, 77], "scatter_nd": 77, "scenario": [2, 5, 10, 13, 18, 21, 23, 24, 25, 28, 68, 69, 70, 72, 74, 75, 88], "scfg": 82, "schedul": [0, 2, 3, 8, 9, 18, 25, 26, 46, 65, 68, 70, 75, 84, 88, 89], "schedule_request": 94, "scheduled_request": 94, "scheduler_config": [65, 76], "schedulerconfig": [0, 65, 76, 88], "schedulerpolici": 88, "schema": [0, 3, 40, 65, 68], "scheme": 0, "scicod": 24, "scienc": [38, 41, 42, 44, 45, 47], "scope": [16, 88], "score": 6, "scout": 86, "scratch": [68, 70, 71, 75], "script": [9, 12, 14, 17, 18, 27, 50, 51, 52, 60, 64, 67, 68, 69, 70, 80, 85, 87, 88, 89, 90], "sd3": 78, "sd35adalayernormzerox": 78, "sd3patchemb": 78, "sd3transformer2dmodel": 79, "sd3transformer2dmodelconfig": 79, "sdxl": 88, "seamless": 88, "search": [0, 1, 3, 6, 10, 16, 22, 25, 26, 32, 44, 59, 65, 72, 74, 77, 88, 91], "seashor": [30, 56], "seat": [38, 41, 42, 44, 45], "sec": [18, 20, 68, 69, 70, 72, 74, 75], "second": [1, 3, 6, 8, 9, 10, 18, 19, 21, 22, 24, 65, 74, 77], "secondari": [0, 65, 84], "secondary_offload_min_prior": 65, "secondaryoffloadminprior": 0, "secondli": 74, 
"section": [3, 6, 14, 15, 17, 18, 26, 60, 64, 66, 68, 70, 72, 73, 74, 75, 77, 83, 86, 88, 92], "section_s": 77, "secur": [40, 88], "securityprotocol": 40, "see": [0, 1, 5, 6, 10, 14, 15, 18, 19, 21, 22, 23, 26, 27, 28, 30, 36, 56, 62, 68, 69, 70, 72, 74, 75, 76, 77, 78, 79, 84, 85, 87, 88], "seed": [0, 6, 26, 34, 35, 65, 81, 88], "seem": [8, 46, 53, 68, 71], "seen": [10, 18, 68], "segment": 88, "select": [0, 4, 6, 16, 23, 24, 25, 68, 75, 77, 82, 84, 91, 94], "selectcontextid": 0, "selectgenidx": 0, "selective_scan": 77, "self": [0, 5, 7, 12, 14, 15, 47, 65, 68, 77, 79, 82, 87, 90, 93, 94], "self_attent": 15, "self_attention_mask": 78, "self_attention_packed_mask": 78, "self_attn": [15, 90], "selfidx": 0, "sell": 68, "semicolon": 60, "senat": [38, 41, 42, 44, 45], "send": [0, 2, 14, 24, 26, 70, 71, 77, 83, 88], "sens": 72, "sensit": [24, 72], "sent": [0, 10, 26], "sentenc": [0, 6, 65, 83], "separ": [10, 25, 49, 60, 68, 77, 82, 92], "separate_match_rewrit": 7, "seq": [1, 5, 68, 77], "seq_idx": 82, "seq_len": [69, 77, 78, 92], "seq_length": 77, "seq_lens_cuda": 92, "seqlen": [0, 77], "seqslot": 1, "sequenc": [0, 1, 3, 5, 6, 7, 8, 10, 14, 18, 19, 20, 21, 22, 24, 65, 66, 68, 69, 70, 73, 76, 77, 78, 82, 84, 88, 92, 93], "sequence_length": [77, 78, 82, 87], "sequence_length_buff": 82, "sequence_limit_length": 82, "sequenceindex": [0, 3], "sequencelengthscba": 1, "sequencelimitlength": 1, "sequenti": [0, 2, 10, 84], "seri": 88, "serial": [25, 77, 79, 82], "serializ": 65, "serialize_engin": 82, "serializeds": 0, "serializedst": 0, "serv": [0, 2, 3, 5, 10, 14, 16, 22, 23, 29, 30, 31, 33, 34, 35, 36, 37, 55, 56, 57, 59, 65, 75, 88, 91, 92], "server": [0, 8, 10, 14, 16, 20, 27, 29, 30, 31, 33, 34, 35, 55, 56, 57, 59, 88], "server_start_timeout": 26, "servic": [16, 54, 59], "session": [5, 64, 68, 82], "set": [0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 13, 15, 16, 17, 18, 24, 25, 26, 32, 40, 50, 51, 52, 60, 62, 65, 66, 67, 69, 70, 72, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84, 87, 88, 94], 
"set_attn_processor": 79, "set_from_opt": 1, "set_if_not_exist": 79, "set_input_shap": 82, "set_rank": 79, "set_rel_attn_t": 78, "set_shap": 82, "setadditionalmodeloutput": [0, 3], "setallottedtimem": 0, "setbackend": 0, "setbadword": 0, "setbatchingtyp": 0, "setbeamsearchdiversityr": 0, "setbeamwidth": 0, "setbeamwidtharrai": 0, "setbitto": 0, "setcachest": 0, "setcachetransceiverconfig": 0, "setclientid": 0, "setcommst": 0, "setcommunicationmod": 0, "setcommunicationtyp": 0, "setcontextfmha": 1, "setcontextphaseparam": [0, 2], "setcopyonpartialreus": 0, "setcrossattentionmask": 0, "setcrosskvcachefract": 0, "setcudagraphcaches": 0, "setcudagraphmod": 0, "setdatatyp": 1, "setdebugconfig": 0, "setdebuginputtensor": 0, "setdebugoutputtensor": 0, "setdebugtensornam": 0, "setdebugtensorsmaxiter": 0, "setdecodingconfig": 0, "setdecodingmod": 0, "setdeviceid": 0, "seteagleconfig": 0, "seteagleinput": 1, "setearlystop": 0, "setembeddingbia": 0, "setenableblockreus": 0, "setenablechunkedcontext": 0, "setenablecontextfmhafp32acc": 0, "setenablepartialreus": 0, "setenabletrtoverlap": 0, "setencodedvocab": 0, "setencoderhiddens": 1, "setencoderinputfeatur": 0, "setencoderinputtokenid": 0, "setencoderoutputlength": 0, "setendid": 0, "seteventbuffermaxs": 0, "setexecutionconfig": 1, "setexplicitdrafttokensinput": 1, "setextendedruntimeperfknobconfig": 0, "setexternaldrafttokensconfig": 0, "setfreegpumemoryfract": 0, "setfrequencypenalti": 0, "setfrom": 0, "setfrominput": 1, "setgathergenerationlogit": 0, "setgemmallreducedtyp": 1, "setgpuweightsperc": [0, 11], "setguideddecodingconfig": 0, "setguideddecodingparam": 0, "sethostcaches": 0, "setinittozero": 1, "setisorchestr": 0, "setiterstatsmaxiter": 0, "setkvcacheconfig": 0, "setkvcacheretentionconfig": 0, "setkvcachetyp": 1, "setlanguageadapteruid": 0, "setlayertyp": 1, "setlengthpenalti": 0, "setlevel": 1, "setlogitsdtyp": 1, "setlogitspostprocessor": 0, "setlogitspostprocessorconfig": 0, "setlogitspostprocessornam": 0, 
"setlookaheadconfig": 0, "setlookaheaddecodingconfig": 0, "setloraconfig": 0, "setloramodul": 1, "setmanagedweightsmap": 1, "setmanageweightstyp": 1, "setmaxattentionwindowvec": 0, "setmaxbatchs": [0, 1], "setmaxbeamwidth": [0, 1], "setmaxdraftpathlen": 1, "setmaxdrafttoken": 1, "setmaxencoderlen": 1, "setmaxinputlen": 1, "setmaxlorarank": 1, "setmaxnumpath": 1, "setmaxnumtoken": [0, 1], "setmaxpagesperblock": 1, "setmaxpositionembed": 1, "setmaxpromptembeddingtables": 1, "setmaxqueues": 0, "setmaxseqidlemicrosecond": 0, "setmaxsequencelen": 1, "setmaxtoken": 0, "setmedusachoic": 0, "setmem": 1, "setmemorytyp": 1, "setminp": 0, "setmintoken": 0, "setmlphiddens": 1, "setmodelnam": 1, "setmodelvari": 1, "setmropeconfig": 0, "setmultiblockmod": 0, "setmultimodalembed": 0, "setnbcrosskvhead": 1, "setnbkvhead": 1, "setnorepeatngrams": 0, "setnormalizelogprob": 0, "setnumcopystream": 1, "setnumdecodingenginetoken": 1, "setnumkvheadspercrosslay": 1, "setnumkvheadsperlay": 1, "setnumlanguag": 1, "setnumnod": 0, "setnumreturnsequ": 0, "setonboardblock": 0, "setorchestratorconfig": 0, "setorchleadercomm": 0, "setoutputconfig": 0, "setpadid": 0, "setpagedcontextfmha": 1, "setpagewidth": 1, "setparallelconfig": 0, "setparticipantid": 0, "setpath": 1, "setpeftcacheconfig": 0, "setpositionid": 0, "setppreducescatt": 1, "setpresencepenalti": 0, "setprior": 0, "setprocessorbatch": 0, "setprocessormap": 0, "setprompttableoffload": 0, "setprompttuningconfig": 0, "setquantmod": 1, "setrecvpollperiodm": 0, "setrepetitionpenalti": 0, "setrepl": [0, 3], "setrequeststatsmaxiter": 0, "setrequesttyp": [0, 2], "setreturnallgeneratedtoken": 0, "setrnnconfig": 1, "setrotaryembeddingdim": 1, "setsamplingconfig": 0, "setschedulerconfig": 0, "setse": 0, "setsecondaryoffloadminprior": 0, "setsinktokenlength": 0, "setsizeperhead": 1, "setskipcrossattnblock": [0, 1], "setslotsperpag": 1, "setspawnprocess": 0, "setspecdecconfig": 0, "setspeculativedecodingmod": 1, "setspeculativedecodingmodul": 1, 
"setstoptokenid": 0, "setstopword": 0, "setstream": 0, "settemperatur": 0, "setter": [0, 6], "settokenizerstr": 0, "settokensperblock": 1, "settopk": 0, "settopp": 0, "settoppdecai": 0, "settoppmin": 0, "settoppresetid": 0, "settotalnumpag": 1, "setup": [1, 5, 25, 40, 50, 51, 52, 62, 71, 72, 82, 83, 84, 88], "setup_fake_prompt": 82, "setup_fake_prompts_qwen2vl": 82, "setup_fake_prompts_vila": 82, "setup_input": 82, "setupeagl": 1, "setupexplicitdrafttoken": 1, "setuplookahead": 1, "setupspeculativedecod": 1, "setuptool": [61, 62], "setusecrossattent": 1, "setusegpudirectstorag": 0, "setusemrop": 1, "setusepositionembed": 1, "setuseshapeinfer": 1, "setusetokentypeembed": 1, "setworkerexecutablepath": 0, "setzero": [0, 1], "seve": 65, "sever": [0, 1, 2, 5, 7, 10, 13, 32, 72, 73, 74, 75, 77, 84, 87, 92], "sft": 53, "sh": [14, 27, 88, 89], "shah": 88, "shaken": 46, "shall": [17, 84], "shape": [0, 1, 5, 7, 9, 13, 14, 24, 65, 75, 77, 79, 82, 84, 85, 87, 88, 92, 93], "shape_cast_dtyp": 77, "shapeequ": 1, "shard": [15, 24, 59, 68, 73, 77, 78], "shard_map": 15, "sharding_along_vocab": 65, "sharding_dim": [77, 78], "share": [1, 2, 3, 5, 7, 8, 9, 10, 17, 18, 23, 24, 25, 60, 71, 72, 77, 78, 88], "share_embed": 88, "share_weight": 78, "shared_embedding_t": 88, "shared_ptr": [0, 1], "sharedconstptr": 1, "sharedptr": 1, "shelf": 88, "sherlock113": 88, "ship": [17, 46], "shm": 87, "short": [5, 68, 72, 74], "short_mscal": [77, 78], "shorter": [5, 69], "shot": 88, "should": [0, 1, 2, 3, 7, 8, 9, 17, 18, 32, 38, 40, 41, 42, 44, 45, 47, 48, 50, 51, 52, 53, 60, 65, 68, 69, 70, 71, 75, 76, 77, 78, 80, 82, 84, 88, 90, 92, 93, 94], "should_stop": 82, "shouldus": 5, "show": [2, 3, 14, 20, 24, 26, 36, 69, 70, 74, 75, 83, 84, 86, 89], "showcas": [72, 75, 83], "shown": [21, 26, 60, 64, 68, 70, 72, 74, 75, 77], "shrunk": 77, "shuffl": 77, "shut": 2, "shutdown": [0, 54, 64, 65], "si": 5, "sibl": 14, "side": [3, 77], "side_stream_id": 77, "sidestreamidtyp": 77, "sigh": 53, "sigmoid": [14, 77], 
"signal": 0, "signatur": [7, 47, 77], "signifi": 74, "signific": [3, 5, 21, 53, 71, 72, 74, 75], "significantli": [23, 24, 70, 71, 72, 74, 75, 84, 92], "silu": [14, 77, 78], "similar": [0, 5, 6, 7, 10, 18, 19, 21, 32, 44, 48, 67, 68, 76, 77, 91, 94], "similarli": 10, "simpl": [2, 7, 10, 14, 36, 47, 60, 64, 66, 69, 83, 89], "simpler": 10, "simpleschedul": 94, "simplest": 77, "simpli": [5, 10, 66, 68, 69, 74, 83, 87, 90], "simplic": 17, "simplifi": [5, 17, 68, 74, 77, 88], "simultan": [10, 74], "sin": [0, 77, 78], "sinc": [0, 1, 4, 5, 7, 8, 10, 11, 17, 18, 27, 32, 60, 68, 70, 71, 72, 74, 75, 77, 79, 84, 91, 93, 94], "sinco": 78, "singl": [0, 1, 2, 3, 4, 5, 6, 10, 12, 14, 17, 18, 21, 22, 24, 25, 30, 47, 56, 64, 65, 67, 68, 72, 75, 77, 79, 83, 84, 85, 88, 90, 91, 92, 93], "singleton": [7, 77], "sink": [0, 1, 5, 65, 82], "sink_token_len": 82, "sink_token_length": [5, 65, 82], "sinktokenlength": [0, 1], "sinusoid": 78, "sit": [17, 53], "situaiton": 69, "situat": [10, 53, 59, 70, 74], "size": [0, 1, 2, 5, 6, 8, 9, 10, 11, 18, 20, 21, 23, 24, 25, 26, 32, 47, 50, 51, 52, 59, 65, 67, 68, 69, 70, 71, 72, 73, 75, 77, 78, 79, 82, 87, 88, 92, 94], "size_t": [0, 1], "size_typ": [0, 1], "sizeof": 1, "sizeperhead": [0, 1], "sizetype32": [0, 1], "sizetype64": 1, "skip": [0, 1, 7, 15, 18, 28, 54, 60, 65, 77, 94], "skip_attn": [77, 78], "skip_cross_attn_block": [79, 82], "skip_cross_kv": [78, 82], "skip_encod": 82, "skip_special_token": [65, 88], "skip_tokenizer_init": [32, 65], "skipcrossattnblock": [0, 1], "sku": [70, 72, 74, 75], "skywork": [85, 86, 88], "sleep": 28, "slice": [1, 4, 15, 77, 88], "slice_shap": 15, "sliceinputtyp": 77, "slicen": 1, "slide": [59, 76, 77, 82, 88], "slider": [18, 24, 68], "sliding_window": 79, "sliding_window_caus": 77, "sliding_window_pattern": 79, "slight": [18, 72, 74, 75], "slightli": [0, 2, 9, 26, 72, 75], "slope": [5, 77], "slot": [0, 1, 88], "slot_map": [77, 79], "slotidx": 1, "slotsperpag": 1, "slow": [3, 8, 65, 66, 71], "slower": [17, 71], 
"slowest": 5, "slurm": [14, 50, 51, 52, 62, 64, 87, 88], "sm": [86, 88], "sm120": 88, "sm80": [86, 88], "sm86": [86, 88], "sm89": [86, 88], "sm90": [86, 88], "small": [5, 8, 10, 14, 23, 24, 70, 72, 74, 75, 77, 84, 87, 88], "smaller": [1, 10, 18, 25, 67, 68, 71, 74, 75, 76, 77, 84, 88], "smallest": [0, 1, 77], "smart": 77, "smaug": [86, 88], "smi": [18, 24, 68, 84], "smile": 53, "smith": [38, 41, 42, 43, 44, 45, 47, 54], "smooth": [17, 65, 88], "smoother": 18, "smoothquant": [7, 23, 59, 88], "smoothquant_v": 65, "snapshot": 68, "snapshot_download": 53, "snip": 68, "snippet": [68, 88, 94], "snshrivas10": 53, "so": [0, 2, 3, 5, 7, 9, 10, 16, 17, 18, 24, 27, 32, 44, 60, 65, 68, 71, 72, 74, 75, 76, 77, 78, 79, 84, 86, 88, 90, 93], "socketst": 0, "softmax": [5, 14, 77, 92], "softplu": 77, "softwar": [3, 5, 14, 59, 66, 88], "solid": 73, "solut": [16, 64, 87, 91], "some": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 17, 18, 24, 25, 26, 28, 53, 62, 65, 66, 69, 72, 73, 75, 76, 77, 80, 83, 84, 87, 88, 90, 91, 94], "someth": [14, 32, 46], "sometim": 68, "song": 68, "soon": [0, 19, 20, 21, 22, 23, 32], "sophist": 47, "sora": [30, 56], "sort": [0, 1, 3, 6, 77], "sota": 88, "sourc": [12, 13, 15, 17, 18, 19, 22, 24, 25, 26, 29, 30, 31, 33, 34, 35, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59, 65, 66, 77, 78, 79, 80, 81, 82, 88], "source_root": [50, 51, 52], "sourcetaskvalu": 1, "soyer": [12, 14, 87], "space": [9, 60, 65, 74, 84, 93], "spaces_between_special_token": [65, 88], "span": [17, 24], "spars": [10, 77, 88], "sparsiti": 25, "spatial_norm_dim": 78, "spawn": [36, 45, 61, 62, 64, 70, 83, 87], "spawnprocess": [0, 2], "spec": 25, "spec_decoding_generation_length": [77, 78, 79], "spec_decoding_is_generation_length_vari": [77, 78, 79], "spec_decoding_max_generation_length": [77, 78], "spec_decoding_packed_mask": [77, 78, 79], "spec_decoding_param": [78, 79], "spec_decoding_position_offset": [77, 78, 79], "spec_decoding_us": [77, 78], 
"specdecconfig": 0, "specdecfastlogitsinfo": 0, "specdecodinggenerationlength": 1, "specdecodinggenerationlengthshost": 1, "specdecodingpackedmask": 1, "specdecodingparam": 78, "specdecodingpositionoffset": 1, "special": [2, 5, 9, 14, 15, 19, 25, 65, 88], "specif": [0, 1, 4, 6, 7, 9, 10, 13, 17, 20, 23, 24, 26, 47, 60, 62, 68, 71, 72, 75, 77, 83, 88, 90, 91], "specifi": [0, 1, 2, 3, 5, 6, 7, 9, 10, 15, 17, 18, 25, 26, 32, 39, 40, 47, 49, 53, 54, 60, 64, 65, 67, 68, 69, 71, 72, 74, 76, 77, 79, 80, 82, 83, 84, 87, 88, 92], "specul": [0, 1, 3, 24, 59, 63, 65, 68, 70, 77, 88], "speculative_config": [18, 24, 39, 48, 49, 65], "speculative_decod": 88, "speculative_decoding_draft_tokens_extern": 79, "speculative_decoding_mod": [25, 65, 68], "speculative_model": [39, 49, 65], "speculativedecod": 0, "speculativedecodingconfig": 0, "speculativedecodingfastlogitsinfo": 0, "speculativedecodingmetr": 0, "speculativedecodingmod": [65, 79, 88], "speculativedecodingmodul": 88, "speculativedecodingoutput": 1, "speed": [14, 20, 24, 25, 68, 69, 75, 88], "speedup": [20, 22, 23, 24], "spent": 0, "split": [1, 4, 5, 9, 14, 68, 71, 72, 77, 84, 88], "split_input_id": 82, "split_prompt_by_imag": 82, "split_siz": 77, "split_size_or_sect": 77, "splittransposecpu": 1, "splittransposecpuinn": 1, "splitwis": 2, "spot": 74, "sq": [23, 85, 88], "sqrt": [5, 77], "squar": [74, 77], "squared_relu": 77, "squeez": [1, 77, 82], "src": [1, 14, 77], "src_seq_len": 77, "srctype": 1, "srun": [14, 26, 50, 51, 52, 62, 87], "sshd": 27, "ssid": 40, "ssm": 77, "ssm_state": 79, "stabil": 24, "stabl": [5, 15, 25, 70, 74, 75, 77, 88], "stack": [15, 24, 60, 77], "stage": [0, 5, 7, 10, 69, 84, 88, 92], "stai": [20, 23, 71, 75], "stand": 14, "standalon": 17, "standard": [10, 14, 16, 19, 69, 77], "starcod": [64, 86, 88], "starcoder1": 85, "starcoder2": [85, 88], "starrickliu": 88, "start": [0, 3, 5, 7, 8, 18, 25, 27, 28, 29, 30, 31, 33, 34, 35, 52, 53, 55, 56, 57, 60, 64, 65, 66, 68, 69, 70, 71, 74, 76, 77, 79, 81, 82, 
84, 88], "start_dim": 77, "startup": 87, "stat": [0, 65, 88], "state": [0, 1, 3, 4, 5, 7, 8, 10, 18, 24, 25, 36, 38, 39, 41, 42, 43, 44, 45, 47, 49, 54, 61, 62, 65, 68, 69, 70, 74, 76, 77, 83, 88, 89, 94], "state_dtyp": 82, "state_or_ptr": 77, "state_s": 82, "statement": 64, "stateptr": 0, "states": 1, "static": [0, 1, 3, 10, 25, 65, 77, 78, 79, 82, 88], "static_batch": [65, 76], "static_cast": 85, "staticbatchingstat": 0, "statist": [0, 3, 10, 26, 65, 68, 88], "statu": 87, "std": [0, 1, 3], "stddev": [26, 34, 35], "stdev": [18, 51, 67, 68, 69, 70], "stdit": 88, "stdout": [18, 51, 67, 68, 69, 70], "steadi": 69, "steady_clock": 0, "step": [0, 1, 5, 6, 7, 8, 10, 13, 14, 16, 17, 19, 24, 28, 47, 59, 61, 62, 65, 66, 68, 69, 70, 77, 82, 87, 91, 92, 93, 94], "still": [5, 15, 17, 18, 24, 66, 68, 70, 72, 77, 82, 84, 88], "stop": [0, 1, 3, 6, 7, 10, 65, 68, 74, 82, 83, 88], "stop_reason": [48, 65, 83, 88], "stop_token_id": [3, 65], "stop_words_data": 82, "stop_words_list": 82, "stopping_criteria": 82, "stoppingcriteria": [82, 88], "stoppingcriterialist": 82, "stoptokenid": [0, 3], "stopword": 0, "stopwordslen": 1, "stopwordslist": 1, "stopwordsptr": 1, "storag": [0, 9, 65], "store": [0, 1, 5, 8, 9, 14, 20, 24, 46, 49, 64, 65, 68, 76, 77, 79, 84, 85, 90, 92, 93], "store_tru": 49, "stored_block": 46, "stori": 53, "str": [13, 17, 42, 43, 65, 77, 78, 79, 82], "strategi": [0, 10, 23, 32, 44, 59, 68, 73, 77, 79, 84, 88], "stream": [0, 1, 2, 3, 14, 25, 26, 32, 34, 35, 36, 37, 47, 65, 67, 77, 82, 84, 87, 88], "stream_ptr": 47, "streaming_llm": 88, "streamingllm": [25, 59, 88], "streamlin": [68, 83], "streamptr": [0, 1, 3], "street": 53, "strenum": [65, 81], "strict": 24, "strict_bound": 77, "strict_dtyp": [77, 78], "stricter": 24, "strictli": 68, "stride": [1, 77, 78], "strike": [10, 46], "string": [0, 1, 3, 13, 40, 65, 68, 77, 82], "string_valu": 8, "string_view": 1, "stringptrmap": 1, "stringvec": 0, "strip": [25, 88], "strip_plan": 25, "strongli": 72, "strongly_typ": [65, 88], 
"struct": [0, 1], "structur": [0, 4, 7, 10, 47, 65, 77, 84, 88], "structural_tag": 65, "struggl": 53, "student": [38, 41, 42, 44, 45, 47], "studi": [70, 72, 73, 75], "style": [5, 10, 24, 88], "sub": [13, 17, 77], "subclass": [1, 17, 47, 90], "subcommad": 68, "subcommand": [69, 88], "subgraph": [7, 77], "subject": [2, 19, 21, 22, 23, 64, 77, 83, 89], "submiss": 68, "submit": [9, 65, 68], "submit_sync": 65, "submodul": [18, 60, 90], "suboptim": 14, "subscript": 77, "subsequ": [2, 8, 9, 10, 70], "subset": [0, 3, 6, 14, 17, 68, 77], "substanti": [8, 10, 24], "subsystem": 88, "subtract": 7, "succe": [84, 88], "succeed": 82, "success": [3, 20, 24, 69], "successfulli": [10, 28, 72], "sudo": [18, 24, 61, 62, 68], "suffer": 24, "suffici": [71, 72], "suggest": [5, 23, 53, 72], "suit": [5, 68, 69], "sum": [1, 7, 12, 77, 93], "sum_of_token": 77, "summar": [5, 10, 11, 12, 13, 21, 23, 68, 69, 76, 84], "summari": [10, 59], "summat": 77, "sunjiabin17": 88, "super": [7, 12, 15, 17, 86, 87, 90, 94], "superchip": 86, "supplementari": 78, "suppli": [9, 16], "support": [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 13, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 32, 40, 47, 50, 51, 52, 53, 59, 62, 63, 65, 69, 70, 72, 74, 75, 76, 77, 78, 80, 83, 87, 88, 89, 90, 91, 92, 93, 94], "supportsinflightbatch": 1, "suppos": 90, "suprem": [38, 41, 42, 44, 45], "sure": [2, 17, 18, 28, 60, 68, 76, 77, 88], "surpass": 5, "surround": [5, 88], "sweep": [14, 20, 74], "sweet": 74, "swept": 21, "swiglu": [25, 77, 88], "switch": [4, 8, 20, 23, 24, 60, 76, 84, 88], "sxm": [20, 25, 70, 72, 73], "sy": 88, "sync": 82, "synchron": [1, 3, 14, 65, 87, 88], "syntax": [77, 83], "synthet": [18, 26, 34, 35, 68, 69], "synthetic_128_128": 68, "synthetic_2048_2048": 70, "synthetic_2048_2048_1000": 70, "system": [8, 14, 18, 20, 26, 29, 30, 40, 50, 51, 52, 55, 56, 59, 60, 62, 69, 71, 83, 86, 88, 89], "systemat": 24, "t": [0, 1, 5, 10, 14, 17, 24, 26, 27, 32, 46, 50, 51, 52, 62, 65, 67, 68, 71, 74, 75, 77, 79, 82, 87], "t5": [5, 6, 
85, 86, 88], "tabl": [0, 6, 8, 20, 23, 25, 68, 69, 77, 78, 82, 86, 87, 88], "tactic": 25, "tag": [0, 27, 60, 65], "tailor": [23, 72, 75], "take": [0, 1, 2, 5, 6, 7, 8, 13, 17, 46, 53, 66, 68, 70, 71, 74, 77, 78, 93], "taken": [15, 19, 20, 77], "talk": 53, "tanh": [77, 78], "target": [0, 15, 18, 25, 32, 59, 60, 68, 75, 76, 88], "target_isl": 68, "target_osl": 68, "targetcach": 1, "targetpageid": 1, "targetprob": 1, "targettaskvalu": 1, "tarot": 53, "task": [0, 1, 8, 9, 10, 12, 13, 42, 43, 50, 51, 52, 65, 68, 78, 82, 85, 88, 93], "task_id": [9, 68], "task_vocab_s": 78, "taskid": [0, 1], "taskidtyp": 1, "tasklayermoduleconfig": 1, "tasklayermoduleconfigbind": 1, "tasklayermoduleconfiglistptr": 1, "taskshost": 1, "taskvalu": 1, "taskvalueptr": 1, "taslid": 1, "tayef": 88, "tconstptr": 1, "tcp": 28, "team": [13, 17, 18, 24, 28, 86, 88], "tech": 88, "technic": 59, "techniqu": [5, 7, 10, 14, 19, 24, 66, 71, 72, 73, 76, 85, 88], "technologi": [24, 38, 41, 42, 44, 45, 47], "tekit_2025": 68, "tell": [30, 53, 54, 56, 75, 83], "temb": 78, "temp": 82, "temperatur": [0, 1, 6, 26, 29, 30, 31, 32, 36, 38, 39, 41, 42, 43, 44, 45, 46, 47, 49, 54, 61, 62, 65, 68, 70, 76, 82, 83, 88], "tempfil": [41, 44], "templat": [0, 1, 14, 15], "tempor": 82, "temporari": 2, "ten": [10, 23], "tend": 76, "tensor": [1, 6, 13, 14, 15, 18, 19, 20, 21, 22, 24, 26, 45, 47, 59, 65, 68, 69, 72, 73, 75, 77, 78, 79, 82, 85, 87, 88, 90, 92], "tensor_dict": 82, "tensor_input": 7, "tensor_parallel_s": [45, 46, 49, 50, 51, 52, 65, 70, 71, 72, 75, 76], "tensor_shap": 15, "tensorconstptr": 1, "tensorinfo": 82, "tensorloc": 77, "tensormap": 1, "tensorparallel": [0, 1, 6], "tensorptr": [0, 1], "tensorrt": [1, 3, 5, 6, 7, 11, 12, 19, 22, 24, 25, 26, 29, 30, 31, 32, 33, 34, 35, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 61, 62, 63, 67, 69, 72, 73, 75, 76, 77, 82, 85, 87, 89, 90, 91, 92, 93, 94], "tensorrt_llm": [0, 1, 2, 3, 5, 6, 7, 9, 11, 12, 14, 15, 17, 18, 26, 27, 28, 32, 36, 
38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 60, 61, 62, 65, 68, 69, 70, 72, 75, 76, 77, 78, 79, 80, 81, 82, 83, 87, 88, 89, 90, 91, 92, 93], "tensorrt_llm_gpt": 14, "tensorrt_llm_rouge1_threshold": 13, "tensorrtllm_backend": [9, 83, 88], "term": [14, 64, 76, 77, 83], "termin": [0, 8, 28, 69, 88], "test": [5, 23, 24, 26, 30, 56, 59, 60, 61, 62, 68, 69, 70, 72, 73, 74, 75, 76, 86, 88, 93], "test_graph_rewrit": 7, "test_trt_llm": [11, 12, 13], "texec": 0, "text": [0, 3, 5, 6, 8, 25, 30, 32, 36, 37, 38, 45, 46, 54, 56, 61, 62, 65, 66, 68, 69, 70, 76, 82, 83, 86, 87, 88, 89], "text_diff": 65, "text_hidden_s": 79, "textattack": 86, "textprompt": 65, "tg_group": 77, "tgt": [14, 77], "tgt_len": [77, 78], "tgt_seq_len": 77, "th": [1, 13, 77], "than": [0, 1, 2, 3, 5, 6, 7, 8, 10, 14, 18, 19, 20, 21, 23, 24, 25, 60, 65, 66, 68, 69, 70, 71, 72, 74, 76, 77, 82, 84, 87, 88, 92], "thank": 88, "thecodewrangl": 88, "thei": [0, 1, 3, 5, 6, 9, 14, 15, 17, 24, 48, 60, 65, 68, 70, 72, 74, 75, 76, 77, 79, 85, 88], "them": [0, 3, 4, 7, 10, 11, 18, 24, 50, 51, 52, 65, 66, 67, 68, 71, 73, 74, 76, 77, 82, 84, 90], "theoret": 84, "theori": 76, "therebi": [2, 76], "therefor": [11, 17, 69, 77, 87, 93], "thermal": 68, "theta": 77, "thi": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 32, 36, 40, 47, 49, 50, 51, 52, 53, 60, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 83, 84, 85, 87, 88, 89, 90, 91, 92, 93, 94], "thin": 17, "thing": [6, 28, 38, 41, 42, 44, 45, 47, 74, 75], "think": [24, 46, 73], "third": [3, 88], "those": [3, 5, 6, 13, 14, 16, 18, 24, 25, 26, 67, 69, 70, 75, 77, 78, 85], "though": [17, 74, 84], "thread": [0, 1, 5, 32, 64, 68, 82], "three": [2, 3, 13, 23, 24, 76, 77, 85, 90, 91, 92], "threshold": [0, 24, 77, 82], "throttl": 68, "through": [0, 5, 6, 7, 10, 14, 15, 16, 18, 24, 25, 26, 60, 66, 68, 70, 71, 72, 74, 75, 78, 83, 88], "throughout": [70, 73], 
"throughput": [0, 3, 5, 19, 20, 21, 51, 59, 67, 72, 74, 75, 76, 88, 92], "throw": [0, 1], "thu": [8, 17, 18, 24, 60, 77, 84], "thumb": [5, 71, 87], "ti": 5, "tiiuae": 68, "time": [0, 1, 2, 3, 5, 8, 9, 10, 11, 14, 18, 21, 23, 24, 25, 38, 41, 42, 43, 44, 45, 53, 59, 60, 65, 66, 67, 68, 69, 70, 72, 73, 74, 76, 77, 82, 87, 88, 93], "time_embed_dim": 78, "time_encod": 82, "time_point": 0, "timedelta": 65, "timedout": 0, "timelin": 13, "timeout": [0, 26, 32, 65, 88], "timepoint": 0, "timestamp": 0, "timestep": [78, 79], "timestepembed": 78, "timingmetr": 0, "tini": 53, "tinyllama": [26, 29, 31, 34, 36, 38, 40, 41, 42, 43, 44, 45, 46, 47, 48, 53, 54, 55, 57, 61, 62, 64, 83, 89], "tip": 59, "titl": 40, "tle": 11, "tllm_checkpoint_16gpu_tp8_pp2": 71, "tllm_ckpt_dir": 12, "tllm_engine_dir": 12, "tllm_kei": [15, 78], "tllm_llmapi_build_cach": 88, "tllm_llmapi_enable_nvtx": 67, "tllm_log_level": 87, "tllm_nvtx_debug": 67, "tllm_override_layer_num": 88, "tllm_profile_record_gc": 67, "tllm_profile_start_stop": 67, "tllm_to_externel_key_dict": 15, "tllm_torch_profile_trac": 67, "tllm_trace_model_forward": 88, "tllm_weight": 15, "tllmruntim": [1, 6, 87], "tlntin": 88, "tmp": [9, 11, 51, 67, 68, 71], "tmp9so41y3r": 68, "tmpowsrb_f4": 68, "tmpxhdvasex": 68, "to_arrai": 77, "to_dict": [65, 79], "to_json_fil": 79, "to_layer_quant_config": 79, "to_legacy_set": 80, "to_str": [0, 1, 3], "to_trt": 79, "tobyt": 1, "todo": [1, 49, 77], "togeth": [3, 5, 6, 9, 14, 16, 19, 24, 25, 82, 85, 88], "toggl": 67, "toi": 74, "toitensor": 0, "tojsonstr": 0, "tok": [19, 21, 22, 75], "token": [0, 1, 2, 3, 4, 5, 6, 8, 10, 14, 18, 19, 22, 23, 24, 25, 26, 27, 34, 35, 40, 46, 47, 51, 59, 65, 67, 68, 69, 70, 72, 73, 75, 77, 78, 79, 82, 83, 84, 85, 88, 90, 91, 92], "token_drop": 78, "token_end": 65, "token_extra_id": 46, "token_id": [32, 46, 47, 48, 65], "token_ids_diff": 65, "token_range_retention_config": 65, "token_start": 65, "token_type_id": [79, 82], "tokenend": 0, "tokenextraid": 1, "tokenextraidtyp": 
1, "tokenid": 1, "tokenidtyp": [0, 1], "tokenization_utils_bas": 65, "tokenizer_dir": [12, 14, 83, 87], "tokenizer_image_token": 82, "tokenizer_max_seq_length": [65, 72, 79, 81], "tokenizer_mod": 65, "tokenizer_revis": 65, "tokenizer_str": [0, 3], "tokenizerbas": 65, "tokenizerstr": [0, 3], "tokenlogprob": 65, "tokenrangeretentionconfig": [0, 65], "tokenrangeretentionprior": 0, "tokens_per_block": [8, 25, 82, 88, 93], "tokensperblock": [0, 1, 6], "tokensperstep": 1, "tokensprompt": 65, "tokenstart": 0, "tokyo": [30, 56], "toler": 23, "tomodulenam": 1, "tomoduletyp": 1, "tonylek": 88, "too": [3, 5, 18, 70, 74, 87], "took": 70, "tool": [2, 13, 18, 59, 64, 68, 88], "tool_cal": 83, "toolkit": [16, 17, 23, 24, 62, 91], "top": [0, 5, 6, 10, 14, 16, 65, 77, 88], "top1": 24, "top_k": [6, 65, 82, 88], "top_p": [6, 36, 38, 39, 41, 42, 43, 44, 45, 46, 47, 49, 54, 61, 62, 65, 70, 76, 82, 83], "top_p_decai": [65, 82], "top_p_min": [65, 82], "top_p_reset_id": [65, 82], "topenkoff": 88, "topic": 75, "topk": [0, 1, 4, 6, 10, 24, 77, 88], "topk_logit": 3, "topklastdim": 77, "topklogit": 3, "topkmedusahead": 1, "topktopp": [0, 6], "topmodelmixin": [17, 79], "topn": 24, "topp": [0, 1, 6, 88], "toppdecai": [0, 1, 6], "toppmin": [0, 1, 6, 65], "toppresetid": [0, 1, 6], "torch": [5, 15, 47, 54, 60, 61, 62, 65, 68, 77, 82, 87, 90], "torchaudio": [61, 62], "torchvis": [61, 62], "tostr": [0, 1], "total": [0, 1, 4, 5, 6, 10, 13, 15, 18, 25, 26, 68, 69, 70, 71, 84, 93], "total_lat": [19, 22], "total_token": 83, "totalaccepteddrafttoken": 0, "totaldrafttoken": 0, "totalgentoken": 1, "totalnumpag": 1, "totensor": 0, "touch": [27, 90], "tp": [0, 2, 4, 6, 9, 14, 18, 19, 20, 21, 22, 23, 24, 26, 51, 68, 69, 70, 77, 88], "tp1": [19, 20, 21], "tp2": 68, "tp4": 24, "tp4ep2": 24, "tp8": [21, 24], "tp8ep2": 24, "tp_1_pp_1": 68, "tp_dim": [15, 78], "tp_group": [77, 78], "tp_rank": [15, 77, 78], "tp_size": [4, 9, 13, 14, 15, 17, 26, 33, 50, 52, 68, 69, 71, 77, 78, 81, 88], "tp_split_dim": 78, "tpot": 
[22, 69], "tprank": 1, "tpsize": 1, "tqdm": [15, 65, 88], "trace": [17, 67, 87], "track": [5, 65, 77], "trade": 8, "tradeoff": [23, 24, 72], "tradit": 0, "train": [10, 12, 13, 14, 16, 17, 20, 23, 68, 77, 87, 90], "trait": 88, "transa": 77, "transb": 77, "transceiv": [0, 65], "transfer": [0, 2, 14, 47, 65, 88], "transform": [0, 4, 5, 10, 12, 13, 14, 15, 25, 26, 32, 65, 79, 83, 84, 86, 87, 88, 90, 91, 93], "translat": [76, 88], "transmiss": 2, "transmit": 2, "transpos": [1, 13, 77], "transposit": 77, "travers": 14, "treat": [5, 24, 77], "tree": [0, 68, 82, 87, 93], "tri": 94, "tricki": 79, "trigger": [5, 7, 14, 25, 32, 54, 64], "trim": 1, "trimpool": 1, "triton": [8, 9, 10, 14, 16, 59, 66, 88], "tritonserv": 88, "trivial": 14, "troubleshoot": [59, 88], "trt": [0, 2, 3, 4, 5, 6, 7, 8, 9, 14, 15, 20, 27, 41, 44, 68, 74, 77, 79, 81, 82, 84, 87, 88, 92], "trt_ckpt": [9, 11, 13, 87], "trt_engin": [9, 11, 13, 87], "trt_root": 18, "trt_tensor": [14, 77], "trtdatatyp": 1, "trtgptmodel": 84, "trtgptmodeloptionalparam": 88, "trtgptmodelv1": 88, "trtllm": [8, 9, 11, 12, 13, 14, 17, 18, 29, 30, 31, 32, 33, 34, 35, 36, 37, 50, 55, 56, 57, 59, 64, 65, 68, 69, 72, 73, 74, 75, 84, 87, 88], "trtllm_dg_jit_use_nvcc": 18, "trtllm_disable_kv_cache_transfer_overlap": 2, "trtllm_disable_unified_convert": 15, "trtllm_enable_kvcache_receive_parallel": 2, "trtllm_enable_mmha_multi_block_debug": 68, "trtllm_enable_pdl": [18, 24, 68], "trtllm_force_xqa": 5, "trtllm_kvcache_send_max_concurrency_num": 2, "trtllm_kvcache_transfer_buffer_s": 2, "trtllm_kvcache_transfer_use_async_buff": 2, "trtllm_mmha_blocks_per_sequ": 68, "trtllm_mmha_kernel_block_s": 68, "trtllm_model": 15, "trtllm_modules_to_hf_modul": 82, "trtllm_parallel_cache_send": 2, "trtllm_pdl_overlap_ratio": 68, "trtllm_precompiled_loc": 60, "trtllm_prefetch_ratio": 68, "trtllm_request_kv_cache_concurr": 2, "trtllm_serv": 26, "trtllm_try_zcopy_for_kvcache_transf": 2, "trtllm_use_mpi_kvcach": 2, "trtllm_use_precompil": 60, 
"trtllm_use_ucx_kvcach": 2, "trtllmattent": 92, "trtlmmdatatyp": 0, "true": [0, 1, 3, 6, 7, 8, 10, 13, 18, 24, 26, 32, 38, 39, 43, 44, 46, 47, 48, 49, 51, 53, 65, 67, 68, 69, 72, 75, 77, 78, 79, 80, 82, 84, 87, 88], "true_output_valu": 77, "true_valu": 77, "truncat": [65, 88], "truncate_prompt_token": [65, 88], "trust": 65, "trust_remote_cod": [26, 65, 88], "try": [0, 1, 3, 12, 17, 48, 53, 64, 69, 72, 74, 75, 76, 83, 84, 87, 89], "tsuji": 68, "ttensor": 1, "ttft": [69, 72, 74, 75, 76, 88], "ttim": 88, "ttl": 24, "tunabl": 73, "tune": [0, 2, 3, 10, 20, 23, 24, 25, 59, 65, 68, 69, 72, 75, 78, 79, 82, 83, 84, 88], "tuner": 0, "tupl": [0, 1, 77, 78, 82, 94], "turn": [5, 6, 8, 10, 60, 72, 82, 84, 88], "tushar": 88, "tweak": 76, "twice": 14, "two": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 17, 20, 24, 25, 26, 30, 56, 60, 64, 68, 70, 72, 74, 76, 77, 78, 80, 88, 91, 93, 94], "twofold": 10, "twoshot": 77, "txt": [17, 18, 51, 62, 67, 68, 70, 83, 88], "type": [1, 2, 3, 5, 6, 7, 9, 13, 14, 20, 23, 25, 26, 29, 30, 31, 34, 35, 40, 46, 47, 49, 56, 65, 68, 72, 75, 77, 79, 81, 82, 83, 85, 86, 87, 88, 90, 91, 92, 93], "typedef": [0, 1], "typenam": [0, 1, 14], "typetrait": 0, "typic": [0, 2, 7, 12, 14, 17, 23, 26, 62, 64, 71, 72, 75, 76, 80, 82, 84, 88, 90], "typo": 88, "u": [1, 7, 27, 38, 41, 42, 43, 44, 45, 54, 68, 69, 88], "ub": 77, "ub_oneshot": 68, "ub_tp_siz": 68, "ubuntu": [61, 62, 88, 89], "uc_handl": 1, "uc_ptr": 1, "uc_va": 1, "ucx": [2, 88], "ucx_cuda_copy_async_mem_typ": 2, "ucx_cuda_copy_dmabuf": 2, "ucx_info": 2, "ucx_memtype_cach": 2, "ucx_rndv_frag_mem_typ": 2, "ucx_rndv_pipeline_error_handl": 2, "uid": [0, 82], "uint16_t": 0, "uint32": 1, "uint32_t": [0, 1, 77], "uint64": [1, 8], "uint64_t": [0, 1], "uint8": 1, "uint8_t": [0, 1], "uintptr_t": 1, "uk_bgemm": 24, "ulimit": [60, 87], "ultim": 71, "ulyss": 88, "unabl": [62, 74], "unaccept": 72, "unari": 77, "unaryoper": 77, "unbind": 77, "uncas": 86, "uncertainti": 10, "unchang": [10, 75, 77], "uncommon": 14, "undefin": 
77, "under": [0, 23, 25, 60, 64, 68, 69, 87, 88], "underli": [0, 1, 7, 10], "underlying_type_t": 1, "underlyingtyp": [0, 1], "underscor": 72, "understand": [59, 60, 67], "understood": 74, "underutil": 10, "uneven": 88, "unevenli": 24, "unexpect": [87, 88], "unfinish": 0, "unfus": 77, "unfuse_qkv_project": 79, "ungath": 1, "unguid": 40, "unif": 88, "unifi": [13, 17, 23, 88], "uniform": [68, 69, 77], "uniniti": 92, "uninstal": 62, "union": [65, 77], "uniqu": [0, 5, 6, 9, 10, 13, 25, 65, 68], "unique_ptr": [0, 1], "unique_token": 46, "uniqueconstptr": 1, "uniqueptr": 1, "uniquetoken": 1, "unit": [1, 15, 36, 38, 39, 41, 42, 43, 44, 45, 47, 49, 54, 59, 60, 61, 62, 68, 70, 76, 83, 89], "univers": [38, 41, 42, 44, 45, 47], "unless": [0, 32, 65, 71, 75, 76], "unlik": [8, 10], "unlock": 66, "unnecessari": [7, 88, 90, 94], "unneed": [5, 24], "unordered_map": [0, 1, 3], "unpatchifi": 79, "unschedul": 74, "unset": 76, "unsign": 1, "unspecifi": [25, 26, 77], "unsqueez": [1, 77], "unstabl": 17, "unsupport": 88, "until": [0, 1, 3, 6, 8, 10], "untouch": 77, "unus": [0, 68], "up": [0, 5, 6, 9, 10, 18, 20, 21, 24, 25, 40, 68, 74, 75, 88, 93], "up_proj": 15, "upcast": 77, "upcast_attent": 78, "upcast_softmax": 78, "upcom": [23, 93], "updat": [0, 10, 14, 15, 17, 18, 21, 25, 27, 47, 60, 65, 77, 82, 87, 93], "update_from_dict": 65, "update_key_map": 15, "update_kv_cache_typ": 65, "update_output_ids_by_offset": 82, "update_resourc": [91, 93], "update_strategi": 77, "updatenumreturnbeam": 0, "updatespositionid": 1, "upgrad": [61, 62, 83], "uplift": [72, 74, 75], "upon": [10, 69, 75, 87, 88], "upper": [68, 77, 84], "uq_qr_gemm": 24, "url": [26, 30, 34, 35, 56, 60, 61, 62, 88], "us": [0, 1, 2, 3, 4, 5, 6, 8, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 32, 36, 37, 40, 43, 50, 51, 52, 53, 59, 60, 61, 62, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 77, 78, 79, 80, 82, 83, 85, 87, 88, 89, 90, 91, 92, 93, 94], "usabl": 89, "usag": [0, 5, 7, 14, 17, 19, 22, 25, 26, 36, 59, 65, 
68, 75, 76, 77, 83, 88, 92], "use_beam_hyp": 82, "use_beam_search": [44, 65, 88], "use_cach": [77, 78, 79], "use_context_fmha_for_gener": 88, "use_cuda_graph": [18, 51, 69], "use_custom_all_reduc": 88, "use_diff_of_squar": 77, "use_dynamic_tre": [39, 65], "use_embedding_shar": 88, "use_fp32_acc": 77, "use_fp8": 78, "use_fp8_context_fmha": [5, 25, 68, 88], "use_fused_mlp": [25, 68, 88], "use_gemm_allreduce_plugin": 82, "use_gpt_attention_plugin": 82, "use_gpu_direct_storag": 82, "use_implicit_relative_attent": 78, "use_kv_cach": [78, 82], "use_logn_sc": 78, "use_lora": 79, "use_lora_plugin": 82, "use_mamba_conv1d_plugin": 82, "use_meta_recip": 65, "use_modelopt_ckpt": 49, "use_modelopt_quant": 17, "use_mrop": 65, "use_one_more_block": 82, "use_paged_context_fmha": [5, 8, 25, 68, 72, 75], "use_parallel_embed": [13, 14, 79], "use_preload": 79, "use_prompt_tun": [79, 88], "use_py_sess": 87, "use_refit": 65, "use_relaxed_acceptance_for_think": [24, 65], "use_runtime_default": 82, "use_safetensors_load": 79, "use_strip_plan": 65, "use_tqdm": 65, "use_variable_beam_width_search": 82, "usebantoken": 0, "usebanword": 0, "usecrossattent": 1, "usedefaultvalu": 1, "usednumblock": 0, "usedraftlogit": 1, "usedraftlogitshost": 1, "usedynamictre": 0, "usedynamictreehost": 1, "useexpliciteosstop": 0, "usefrequencypenalti": 0, "usegemmallreduceplugin": 1, "usegptattentionplugin": [1, 6], "usegpudirectstorag": 0, "uselanguageadapt": 1, "useloraplugin": 1, "usemambaconv1dplugin": 1, "usemaxlengthstop": 0, "useminlen": 0, "useminlength": 0, "useminp": 0, "usemrop": 1, "usenorepeatngrams": 0, "useoccurrencepenalti": 0, "usepackedinput": 1, "usepagedst": 1, "usepenalti": 0, "usepositionembed": 1, "usepresencepenalti": 0, "useprompttun": 1, "user": [0, 2, 3, 5, 6, 7, 8, 9, 14, 15, 16, 17, 18, 22, 23, 24, 26, 27, 29, 30, 39, 40, 44, 47, 48, 49, 55, 56, 60, 64, 65, 67, 68, 69, 74, 75, 76, 77, 79, 83, 84, 85, 87, 88], "user_buff": [25, 72], "userandomacceptancethreshold": 1, "userbuff": 88, 
"userepetitionpenalti": 0, "userwarn": 62, "useshapeinfer": 1, "usespecdecod": 1, "usestopword": 0, "usetemp": 0, "usetemperatur": 0, "usetokentypeembed": 1, "usevariablebeamwidthsearch": 0, "usr": [13, 18, 26, 29, 30, 31, 33, 34, 35, 62, 68], "usual": [14, 17, 62, 69, 70, 75, 77, 93], "util": [0, 1, 2, 5, 6, 10, 14, 18, 19, 24, 25, 36, 62, 66, 67, 68, 72, 75, 76, 84, 88, 92], "uv_gemm": 24, "uvm": [0, 1], "v": [1, 2, 5, 6, 9, 18, 19, 20, 23, 24, 59, 65, 77, 79, 82, 85, 86, 87, 90, 92], "v0": [9, 19, 20, 21, 22, 66, 68, 69, 86, 88], "v1": [26, 29, 30, 31, 34, 36, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 53, 54, 55, 56, 57, 61, 62, 64, 83, 86, 88, 89], "v10": 88, "v100": 88, "v12": 88, "v2": [23, 85, 88], "v3": [26, 67, 85, 86, 88], "v9": 21, "v_dim": 77, "v_head_dim": [77, 78], "v_proj": [15, 90], "vacat": [38, 41, 42, 44, 45], "valid": [0, 1, 3, 10, 65, 69, 77, 82], "validate_positive_valu": 65, "validatevec": 1, "validationerror": 65, "validmpiconfig": 1, "valu": [0, 1, 2, 5, 6, 8, 9, 11, 13, 14, 15, 18, 19, 20, 25, 26, 32, 54, 65, 68, 70, 72, 74, 76, 77, 79, 80, 81, 82, 84, 85, 87, 88, 92, 93, 94], "valuabl": 24, "value_typ": 0, "valuestatu": 1, "vanilla": [5, 92], "vanillaattent": 92, "var": 77, "vari": [21, 74, 75, 93], "variabl": [0, 1, 6, 15, 18, 21, 24, 50, 51, 52, 59, 62, 65, 67, 68, 87, 88], "variabledraftlength": 1, "varianc": [72, 74, 75, 77], "variant": [0, 3, 5, 17, 19, 64, 77, 83, 88, 92], "varieti": [68, 70, 88], "variou": [5, 10, 16, 68, 72, 74, 88], "varnam": 1, "vartyp": 1, "vboost": [18, 24, 68], "vbw": 88, "ve": [24, 53], "vec": 1, "vec2": 77, "veclogprob": 0, "vectoken": 0, "vectokenextraid": [0, 1], "vector": [0, 1, 3, 5, 6, 9, 77], "vecuniquetoken": [0, 1], "verbatim": 79, "verbos": [25, 26, 68], "veri": [5, 13, 14, 16, 23, 70, 71, 72, 88], "verif": [0, 10, 65], "verifi": [10, 59, 75, 77, 88], "verificationsets": 0, "versa": 8, "version": [0, 1, 2, 5, 6, 13, 15, 17, 18, 24, 26, 32, 60, 62, 68, 70, 77, 83, 87, 88, 89], "vertic": 77, 
"vertical_strid": 78, "via": [0, 2, 10, 24, 50, 51, 52, 53, 60, 62, 68, 72, 73, 75, 76, 77, 88, 89], "vice": [8, 54], "vicuna": [10, 39, 49], "video": [30, 56, 68, 82, 86, 88], "video_grid_thw": 82, "video_path": 82, "video_preprocess": 82, "video_url": [30, 56], "view": [1, 77, 82], "vila": [30, 56, 85, 86, 88], "vinyl": 68, "violat": 88, "virtual": [0, 1, 78], "vision": [82, 85, 86, 88], "vision_grid_thw": 82, "vision_length": 77, "vision_model_typ": 79, "vision_start": 77, "vision_token_mask": 78, "visit": [10, 24, 88], "visual": [74, 88], "visual_engine_dir": 82, "visual_featur": 82, "visualize_network": [25, 65, 88], "vit": 88, "vital": [7, 23], "vl": [26, 30, 35, 56, 68, 86, 88], "vlm": [86, 88], "vocab": [77, 82], "vocab_embed": [12, 15], "vocab_s": [0, 13, 15, 65, 78, 79, 82, 90], "vocab_size_pad": 82, "vocabs": [1, 6], "vocabsizepad": [0, 1], "vocabulari": [0, 1, 6, 8, 10, 69, 78, 82], "void": [0, 1, 3, 14], "volta": 88, "volum": [1, 60, 68], "volumenonneg": 1, "vonjackustc": 88, "vote": [38, 41, 42, 44, 45], "vulner": 88, "vultureprim": 88, "w": [1, 22, 24, 26, 77, 79, 85, 86, 88], "w1": 77, "w4a": [85, 88], "w4a16": [13, 23, 59, 65, 79], "w4a16_awq": [13, 17, 32, 54, 65], "w4a16_gptq": [13, 65], "w4a8": [23, 88], "w4a8_awq": [13, 17, 65], "w4a8_qserve_per_channel": 65, "w4a8_qserve_per_group": 65, "w4aint8": 88, "w8a": 85, "w8a16": [13, 23, 59, 65, 79], "w8a16_gptq": 65, "w8a8": [20, 23, 59], "w8a8_sq_per_channel": [13, 65], "w8a8_sq_per_channel_per_tensor_plugin": [65, 79], "w8a8_sq_per_channel_per_token_plugin": [65, 79], "w8a8_sq_per_tensor_per_token_plugin": [65, 79], "w8a8_sq_per_tensor_plugin": [65, 79], "wa": [0, 1, 3, 5, 6, 13, 62, 64, 68, 69, 70, 72, 74, 75, 76, 78, 85, 87, 88, 90, 94], "wai": [2, 5, 7, 16, 24, 45, 47, 64, 66, 68, 70, 72, 77, 84, 88], "wait": [0, 1, 3, 17, 32, 65, 66, 68, 77], "walk": [30, 53, 56, 70, 71, 72], "wang1120": 88, "wangkuiyi": 88, "want": [5, 10, 17, 24, 28, 62, 67, 68, 72, 74, 76, 77, 87, 88, 90], "warm": 93, 
"warmup": [18, 67, 68, 70, 88, 92, 93], "warn": [5, 25, 26, 68, 69, 84], "warp": 88, "watch": 75, "wdkv": 24, "wdq": 24, "we": [1, 2, 4, 6, 7, 9, 10, 11, 13, 17, 18, 22, 23, 24, 26, 27, 28, 38, 41, 42, 44, 45, 53, 54, 60, 62, 64, 67, 68, 69, 70, 71, 72, 74, 75, 77, 82, 83, 87, 88, 90], "weapon": 46, "wear": 46, "web": [16, 28], "weig": 77, "weight": [0, 1, 4, 9, 17, 19, 20, 23, 24, 25, 26, 45, 59, 65, 66, 69, 70, 71, 72, 77, 78, 79, 82, 83, 88], "weight_index": 77, "weight_load": 78, "weight_only_groupwise_quant_matmul": 85, "weight_only_precis": 88, "weight_spars": [25, 65], "weight_stream": [11, 25, 65], "weightonlygroupwisequantmatmulplugin": 85, "weights_dict": 17, "weights_scaling_factor": [13, 15], "weightsinpoint": 1, "weightsoutpoint": 1, "well": [5, 6, 14, 16, 20, 32, 67, 74, 75, 85, 86], "were": [0, 1, 10, 13, 17, 19, 23, 69, 71, 74, 88], "weren": 62, "wget": 87, "what": [2, 3, 30, 53, 56, 59, 60, 67, 68, 70, 72, 74, 75], "whatev": 1, "wheel": [60, 62, 88], "when": [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 14, 15, 17, 18, 22, 23, 25, 27, 32, 47, 59, 60, 62, 65, 67, 68, 70, 72, 74, 75, 76, 77, 78, 79, 82, 83, 84, 85, 87, 88, 90, 92, 93], "whenev": 1, "where": [0, 1, 2, 5, 6, 8, 10, 13, 14, 19, 23, 24, 26, 29, 31, 32, 53, 55, 57, 65, 68, 69, 72, 74, 76, 77, 82, 83, 85, 88, 94], "wherea": [0, 13, 74], "whether": [0, 1, 2, 3, 5, 9, 25, 65, 71, 72, 75, 77, 78, 82, 91, 92], "which": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 14, 15, 17, 19, 23, 24, 25, 26, 60, 62, 64, 65, 67, 68, 70, 72, 74, 75, 76, 77, 79, 80, 82, 83, 84, 85, 88, 89, 91, 92, 94], "while": [0, 1, 4, 7, 8, 10, 14, 17, 19, 20, 22, 23, 24, 62, 66, 68, 70, 71, 72, 73, 74, 75, 76, 77, 84, 85, 88, 92], "whisper": [85, 86, 88], "whisperencod": 79, "whl": [18, 60, 61, 62], "who": 64, "whole": [1, 65, 66, 77], "whose": [2, 8, 13, 24, 78], "why": [0, 2, 14, 65, 72, 74, 75, 77, 84], "wide": [0, 4, 70], "width": [0, 1, 5, 6, 35, 65, 78, 82, 84, 88], "window": [0, 1, 10, 25, 59, 65, 68, 77, 82, 88], "window_s": 5, 
"windows": 0, "wip": 24, "wireless": 40, "wirelessaccesspoint": 40, "wise": [7, 65, 77, 88], "wish": 8, "wit": 46, "with_ssh": 27, "within": [1, 2, 5, 10, 14, 46, 65, 68, 71, 72, 74, 75, 77, 83, 93], "without": [0, 1, 3, 5, 10, 14, 15, 18, 23, 24, 25, 32, 46, 66, 68, 72, 75, 77, 79, 88, 90, 92], "wkr": 24, "wo": [15, 24, 88], "wo_gemm": 24, "won": [62, 71], "word": [0, 3, 5, 65, 77, 82, 88], "word_dict": 82, "word_embed": 15, "word_embeddings_layernorm": 15, "work": [5, 6, 7, 10, 14, 17, 18, 32, 47, 50, 51, 52, 54, 60, 62, 66, 69, 73, 77, 82, 85, 87, 88, 90], "workaround": [15, 18, 88], "workdir": [26, 50, 51, 52, 60], "worker": [14, 25, 26, 65, 68, 84, 88], "workerexecutablepath": 0, "workflow": [5, 6, 12, 13, 18, 32, 59, 64, 69, 70, 72, 73, 77, 83, 87, 88, 89], "workload": [4, 14, 25, 67, 68, 70, 72, 73, 74, 75], "workspac": [1, 25, 26, 65, 68, 77, 84, 88], "workstat": 20, "world": [0, 2, 7, 18, 25, 50, 51, 52, 65, 66, 68, 70, 71, 72, 77], "world_config": 82, "world_siz": [13, 17, 77, 88], "worldconfig": [0, 6, 82], "worldsiz": 1, "wors": [10, 25, 72], "worst": [74, 75], "worth": [5, 72, 75], "would": [0, 7, 10, 68, 70, 72, 74, 76, 77, 90], "wouldn": 46, "wpa2": 40, "wqr": 24, "wrap": [0, 1, 14, 25, 64, 70, 77, 80, 82, 88], "wrapper": [1, 7, 17, 92], "write": [1, 8, 15, 24, 25, 59, 77, 87], "written": [14, 68, 77], "wrong": [10, 46, 88], "wsl": 88, "wuk": 24, "wuq": 24, "wuv": 24, "www": 88, "x": [0, 1, 3, 6, 9, 11, 26, 68, 77, 78, 79, 83, 85, 88], "x86": 8, "x86_64": 86, "xcomposer2": 88, "xgrammar": [0, 3, 40, 88], "xl": 88, "xml": 3, "xor": 77, "xqa": 88, "xxx": [15, 17, 87], "xxx_plugin": 80, "xy": 77, "y": [2, 3, 18, 22, 27, 60, 61, 62, 68, 77, 79, 85], "y_bia": 77, "yaml": [26, 68, 69], "yarn": 77, "ye": [2, 77, 84], "yeah": 53, "yelp": 86, "yen": 68, "yet": [0, 6, 17, 20, 24, 77, 94], "yield": [8, 32, 72, 74], "yiyixu": [30, 56], "yml": [18, 26, 33, 68, 69], "york": [26, 29, 31, 55, 57, 83], "you": [3, 4, 5, 6, 7, 8, 9, 10, 13, 14, 16, 17, 18, 23, 24, 25, 
26, 27, 28, 29, 30, 32, 40, 41, 44, 47, 50, 51, 52, 53, 54, 55, 56, 59, 60, 62, 64, 65, 68, 69, 71, 72, 73, 74, 75, 76, 77, 82, 83, 84, 87, 88, 89, 90, 92], "your": [8, 9, 10, 16, 17, 18, 23, 25, 27, 28, 32, 53, 60, 62, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 83, 87, 90, 92, 93], "your_data_path": 18, "your_dockerhub_usernam": [27, 28], "your_model_path": 18, "your_public_kei": 28, "your_work_path": 18, "yourself": 89, "yuhuili": 39, "yyi": 87, "z": 77, "zars19": 88, "zero": [0, 1, 3, 15, 64, 65, 77, 78, 85, 87], "zero_is_placehold": 77, "zip": 47, "zjli2013": 88, "zoo": 88}, "titles": ["Executor", "Runtime", "Disaggregated-Service (experimental)", "Executor API", "Expert Parallelism in TensorRT-LLM", "Multi-Head, Multi-Query, and Group-Query Attention", "C++ GPT Runtime", "Graph Rewriting Module", "KV cache reuse", "Run gpt-2b + LoRA using Executor / cpp runtime", "Speculative Sampling", "Running With Weight Streaming to Reduce GPU Memory Consumption", "Adding a Model", "TensorRT-LLM Checkpoint", "Model Definition", "TensorRT-LLM Model Weights Loader", "TensorRT-LLM Architecture", "TensorRT-LLM Build Workflow", "How to get best performance on DeepSeek-R1 in TensorRT-LLM", "Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100", "H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token", "H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM", "New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget", "Speed up inference with SOTA quantization techniques in TRT-LLM", "Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs", "trtllm-build", "trtllm-serve", "Build the TensorRT-LLM Docker Image", "Develop TensorRT-LLM on Runpod", "Curl Chat Client", "Curl Chat Client For Multimodal", "Curl Completion Client", "LLM Common Customizations", "Deepseek R1 Reasoning Parser", "Genai Perf Client", "Genai Perf Client For 
Multimodal", "LLM Examples Introduction", "LLM Examples", "Automatic Parallelism with LLM", "Generate Text Using Eagle Decoding", "Generate text with guided decoding", "Generate text", "Generate Text Asynchronously", "Generate Text in Streaming", "Generate text with customization", "Distributed LLM Generation", "Get KV Cache Events", "Control generated text using logits processor", "Generate Text Using Lookahead Decoding", "Generate Text Using Medusa Decoding", "Llm Mgmn Llm Distributed", "Llm Mgmn Trtllm Bench", "Llm Mgmn Trtllm Serve", "Generate text with multiple LoRA adapters", "Generation with Quantization", "OpenAI Chat Client", "OpenAI Chat Client", "OpenAI Completion Client", "Online Serving Examples", "Welcome to TensorRT-LLM\u2019s Documentation!", "Building from Source Code on Linux", "Installing on Grace Hopper", "Installing on Linux", "Key Features", "API Introduction", "API Reference", "Overview", "Performance Analysis", "TensorRT-LLM Benchmarking", "Overview", "Benchmarking Default Performance", "Deciding Model Sharding Strategy", "FP8 Quantization", "Performance Tuning Guide", "Tuning Max Batch Size and Max Num Tokens", "Useful Build-Time Flags", "Useful Runtime Options", "Functionals", "Layers", "Models", "Plugin", "Quantization", "Runtime", "Quick Start Guide", "Memory Usage of TensorRT-LLM", "Numerical Precision", "Support Matrix", "Troubleshooting", "Release Notes", "PyTorch Backend", "Adding a New Model in PyTorch Backend", "Architecture Ovewiew", "Attention", "KV Cache Manager", "Scheduler"], "titleterms": {"": [5, 20, 23, 59], "0": 88, "000": [20, 21], "1": [12, 14, 18, 60, 69, 84, 88], "10": [20, 88], "100m": 20, "11": 88, "12": [21, 88], "13": 88, "13b": 21, "14": 88, "15": 88, "16": 88, "17": 88, "18": 88, "180b": 19, "19": 88, "2": [12, 18, 22, 60, 84, 88], "2b": 9, "3": [12, 14, 18, 68, 69, 84, 86], "4": [12, 18, 20], "405b": [14, 69], "4x": 22, "5": 18, "6": [18, 19], "6x": 20, "7": 88, "70b": [14, 19, 22, 68, 69], "7x": 19, "8": 88, 
"8b": 69, "9": 88, "As": 3, "For": [30, 35], "In": [3, 5, 66], "Not": 84, "One": [24, 60], "The": [3, 85], "To": 70, "With": [11, 66], "a100": [19, 20], "about": [10, 26, 66, 71], "accept": 24, "access": 27, "account": 28, "accuraci": 23, "achiev": [20, 21], "acknowledg": 24, "activ": [78, 84], "ad": [12, 90], "adapt": 53, "addit": 3, "advanc": 59, "alibi": 5, "analysi": 67, "announc": 88, "api": [3, 7, 11, 17, 26, 36, 64, 65, 70, 83, 88, 91], "arbitrari": 3, "architectur": [16, 24, 59, 91], "argument": 25, "asynchron": 42, "asyncio": 32, "attent": [5, 13, 24, 66, 74, 75, 76, 78, 92], "attentionbackend": 92, "attentionmetadata": 92, "auto": 25, "automat": 38, "autoregress": 24, "avoid": 70, "awq": [13, 19, 85], "b200": [18, 24], "backend": [24, 86, 89, 90, 92], "background": 24, "balanc": 24, "base": 32, "baselin": 72, "batch": [3, 5, 66, 74], "beam": [3, 5], "befor": [68, 70], "begin": 70, "behavior": 68, "bench": [51, 67, 70], "benchmark": [2, 18, 23, 26, 68, 69, 70], "best": [18, 23], "bf16": 85, "bia": 5, "bind": [3, 14, 60], "blackwel": 85, "boost": 68, "boundari": 24, "budget": 22, "buffer": [5, 72, 84], "buffermanag": 1, "build": [13, 17, 18, 25, 27, 28, 32, 60, 68, 70, 75], "c": [3, 6, 60, 84], "cach": [5, 8, 13, 46, 72, 76, 84, 93], "cachecommun": 0, "can": [8, 66], "capac": 76, "case": 74, "cast": 78, "caveat": 68, "chang": [11, 74, 88], "chat": [26, 29, 30, 55, 56], "checkpoint": 13, "choos": 23, "chunk": [5, 18, 74, 76], "class": 3, "classic": 7, "cli": [17, 70], "client": [29, 30, 31, 34, 35, 55, 56, 57], "clock": [18, 68], "close": [19, 22], "code": 60, "collect": 67, "combin": 18, "come": 23, "command": 69, "common": [1, 32, 66], "commun": [24, 71], "compil": [14, 18, 60, 83], "complet": [26, 31, 57], "compon": [6, 89], "conclus": [72, 74, 75], "config": [13, 25], "configur": [3, 6, 9, 24, 28, 32, 72, 75, 90], "connect": 28, "consumpt": 11, "contain": [18, 27, 60], "content": [18, 24, 73, 90], "context": [3, 5, 18, 74, 75, 76], "contigu": 5, 
"control": [3, 47], "conv": 78, "convers": [12, 17], "coordin": 67, "core": 90, "cpp": 9, "creat": [28, 60], "cross": 5, "cuda": 24, "cudaev": 1, "cudastream": 1, "curl": [29, 30, 31], "custom": [15, 32, 44, 93, 94], "cutlass": 24, "cyclic": 5, "dataset": [18, 68, 69, 70], "datatransceiverst": 0, "debug": [2, 67, 87], "decid": 71, "decod": [3, 10, 25, 39, 40, 48, 49, 84, 91], "decoderst": 1, "decodinginput": 1, "decodingoutput": 1, "decor": 7, "deepseek": [18, 24, 33], "default": [18, 24, 68, 70], "definit": [14, 83, 90], "dens": 24, "depend": 24, "deploi": 83, "dequant": 85, "descript": 67, "detail": [9, 85], "develop": [28, 89], "diagram": 24, "differ": 3, "disabl": 32, "disaggreg": [2, 26], "disaggregated_mpi_work": 26, "disaggserverutil": 0, "distribut": [45, 50], "do": 66, "docker": [27, 28, 60], "dockerhub": [27, 28], "document": [59, 88], "dora": 9, "download": 18, "dq": 85, "draft": 10, "e2": 87, "eagl": [10, 39], "eaglebuff": 1, "eaglemodul": 1, "embed": [5, 78], "enabl": [4, 8, 18, 27, 67, 72, 75], "endpoint": 26, "engin": [13, 14, 64, 68, 70, 83, 91], "enhanc": 88, "environ": 2, "error": 87, "etp": 24, "evalu": 13, "event": 46, "everyth": 24, "exampl": [2, 3, 9, 13, 14, 15, 36, 37, 58, 67, 68], "except": 84, "execut": 87, "executor": [0, 3, 9], "expect": [8, 18], "experiment": 2, "expert": [4, 24], "explicitdrafttokensbuff": 1, "explor": 18, "face": 64, "factor": [5, 13], "falcon": 19, "faq": [2, 84], "faster": 19, "featur": [18, 63, 67, 88], "file": 60, "first": 20, "fix": 88, "flag": [75, 85], "flayerinfo": 7, "flight": [3, 5, 66], "flow": 68, "fmha": 5, "format": [9, 18], "fp16": 85, "fp32": 85, "fp4": 69, "fp8": [5, 13, 20, 66, 69, 72, 85], "fraction": 76, "free": 76, "from": 60, "full": 60, "fulli": 15, "function": [7, 15, 77], "fuse_a_gemm": 24, "fusion": [14, 24, 72, 75], "futur": [24, 32], "garbag": 67, "gate": 72, "gc": 67, "gemm": [24, 72, 75], "genai": [34, 35], "gener": [2, 5, 32, 39, 40, 41, 42, 43, 44, 45, 47, 48, 49, 53, 54], "get": [18, 
46, 59], "gil": 67, "gpt": [6, 9], "gptdecod": 1, "gptdecoderbatch": 1, "gptjsonconfig": 1, "gptq": 85, "gpu": [11, 14, 18, 19, 24, 66, 68, 76, 84], "grace": 61, "graph": [7, 24], "group": [5, 24], "guid": [3, 40, 73, 83, 89, 90], "h": [0, 1], "h100": [20, 21], "h200": [18, 19, 21, 22], "ha": 20, "hardwar": 86, "hbm": 21, "head": 5, "header": 60, "high": 7, "hopper": [61, 85], "host": 8, "how": [4, 8, 18, 24, 68, 71, 74], "hub": 64, "hug": 64, "i": [20, 71, 84], "ibuff": 1, "id": 9, "igptdecoderbatch": 1, "imag": [27, 28, 60], "implement": [12, 24, 92], "import": 5, "improv": 10, "increas": 22, "indic": 59, "infer": [3, 23, 26, 66, 83, 84], "inform": [7, 67, 83], "infrastructur": 88, "input": 5, "instal": [18, 59, 61, 62, 87], "int4": [19, 85], "int8": [5, 85], "interfac": 93, "intern": 6, "introduct": [36, 64, 90, 93, 94], "ipcnvlsmemori": 1, "ipcutil": 1, "isl": 18, "issu": [18, 84, 88, 89], "itensor": 1, "iter": 67, "kei": [15, 24, 28, 63, 71, 88, 89], "kernel": [22, 24], "knowledg": 73, "known": [60, 84, 88, 89], "kv": [5, 8, 13, 46, 72, 76, 84, 93], "kvcachemanag": 91, "latenc": [18, 22, 24, 68, 70, 72], "latest": [21, 66], "launch": [24, 67], "layer": [24, 78], "layernorm": 13, "layout": 15, "level": [7, 24, 91], "limit": [10, 60, 68, 88], "linear": 78, "link": 60, "linux": [60, 62], "llama": [14, 19, 22, 68, 69, 72, 75], "llama2": 21, "llm": [4, 10, 13, 15, 16, 17, 18, 20, 21, 23, 27, 28, 32, 36, 37, 38, 45, 50, 51, 52, 59, 60, 64, 66, 68, 70, 74, 83, 84, 86, 88], "load": [15, 90], "loader": 15, "local": 64, "logit": [3, 25, 47], "lookahead": [10, 48], "lookaheadbuff": 1, "lookaheadmodul": 1, "lookup": 10, "lora": [9, 25, 53], "loracach": [1, 9], "loracachepagemanagerconfig": 1, "loramodul": 1, "low": [68, 72], "make": 13, "manag": [7, 68, 93], "map": [9, 68], "mark": 3, "marker": 67, "match": 14, "matrix": [85, 86], "max": [18, 68, 74, 76], "maximum": 76, "measur": 69, "medusa": [10, 49, 68], "medusamodul": 1, "memori": [8, 11, 18, 21, 76, 84], 
"memorycount": 1, "method": [7, 23], "metric": 26, "mgmn": [50, 51, 52], "min": 18, "mix": 24, "mixtur": 4, "mlp": [13, 72, 78], "mlperf": 20, "modal": [68, 86], "mode": 68, "model": [6, 10, 12, 14, 15, 16, 18, 24, 64, 68, 69, 71, 72, 75, 79, 83, 86, 87, 88, 90, 91], "modelconfig": 1, "modul": [7, 9], "moe": 4, "moe_backend": 24, "more": [18, 22, 67], "mtp": 24, "multi": [5, 14, 24, 26, 66, 68, 86], "multimod": [26, 30, 35], "multipl": [53, 75], "name": [15, 25], "nativ": [15, 66], "nearli": 21, "network": 68, "new": [12, 22, 90, 92], "next": [23, 83], "node": [14, 26, 66], "non": 68, "norm": [72, 75], "normal": 78, "note": [3, 5, 88], "nsight": 67, "num": 74, "numer": 85, "nvfp4": 85, "nvidia": [24, 67], "nvtx": 67, "o": 84, "obtain": 3, "offload": 8, "onli": [24, 60, 67, 85], "onlin": 58, "openai": [55, 56, 57], "optim": [5, 24, 75], "option": [18, 60, 72, 75, 76], "osl": 18, "other": 68, "out": [18, 90], "output": [3, 68], "over": 19, "overview": [6, 13, 15, 17, 66, 69], "ovewiew": 91, "own": 94, "p": 8, "pack": 5, "pad": 5, "page": [5, 66, 74, 75, 76], "parallel": [4, 9, 24, 25, 38, 68, 71, 75], "paramet": 6, "parser": 33, "part": 12, "pattern": [7, 14], "perf": [34, 35], "perform": [8, 10, 18, 20, 23, 24, 59, 67, 70, 72, 73, 75], "persist": 68, "phase": 5, "pipelin": [71, 75], "pitfal": 70, "plugin": [14, 25, 72, 75, 80], "pod": 28, "polici": 76, "pool": [78, 84], "posit": 5, "post": 3, "postprocess": 15, "power": 68, "practic": 23, "precis": [24, 85], "prepar": [13, 18, 28, 64, 68, 69, 70], "prerequisit": [18, 60, 73, 83, 90], "prevent": 8, "processor": [3, 47], "profil": [24, 67, 75], "programmat": 24, "prompt": 10, "prompttuningparam": 1, "provid": 22, "push": 24, "pyexecutor": 91, "python": [3, 60, 84], "pytorch": [67, 68, 86, 89, 90], "q": 85, "qkv": 5, "quantiz": [13, 17, 23, 32, 54, 68, 72, 81, 85, 89], "quantmod": 85, "queri": 5, "quick": [83, 89], "quickstart": 68, "r1": [18, 24, 33], "rab": 5, "rank": 13, "rawengin": 1, "re": 24, "reason": 33, 
"recommend": [72, 75, 84], "record_signatur": 7, "redraft": 10, "reduc": [11, 72, 75], "refer": [12, 59, 65], "regist": 12, "registr": 90, "rel": 5, "relat": [7, 83], "relax": 24, "releas": 88, "reproduc": [18, 24, 69], "request": [1, 3], "requir": 7, "resourcemanag": 91, "respons": 3, "result": [3, 18, 67, 69, 70], "retriev": 7, "reus": 8, "revisit": 74, "rewrit": 7, "right": 23, "roll": 5, "rope": 5, "rotari": 5, "router": 24, "routergemm": 24, "run": [9, 11, 18, 67, 68, 69, 70, 83], "runpod": 28, "runtim": [1, 6, 9, 14, 32, 60, 76, 82, 84], "runtimedefault": 1, "same": 22, "sampl": [6, 10, 32], "samplingconfig": 1, "save": 70, "scale": [5, 13], "scatter": 75, "schedul": [74, 76, 91, 94], "script": [37, 58], "search": 5, "sec": 21, "send": 3, "serial": 0, "serv": [26, 52, 58, 67, 83], "server": [3, 26, 83], "servic": 2, "set": [68, 71], "shard": 71, "shoot": 15, "singl": 19, "situat": 8, "size": [74, 76, 84], "slide": 5, "slurm": 26, "smart": 24, "smoothquant": 85, "softwar": 86, "sota": 23, "sourc": 60, "spars": 24, "specif": 67, "specul": [10, 25], "speculativedecodingmod": 1, "speculativedecodingmodul": 1, "speed": 23, "ssh": [27, 28], "start": [26, 59, 83, 89], "step": [12, 18, 60, 83, 90], "strategi": [24, 71], "stream": [11, 24, 43], "streamingllm": 5, "structur": 3, "studi": 74, "style": 32, "subcommand": 68, "summari": [68, 72, 75], "support": [14, 15, 18, 60, 64, 66, 68, 85, 86], "swiglu": 72, "syntax": 26, "system": [24, 67], "tabl": [18, 24, 59, 73, 90], "target": 10, "technic": 85, "techniqu": 23, "templat": 28, "tensor": [0, 3, 4, 5, 7, 9, 71, 84], "tensorrt": [4, 10, 13, 14, 15, 16, 17, 18, 20, 21, 23, 27, 28, 59, 60, 64, 66, 68, 70, 74, 83, 84, 86, 88], "test": 87, "text": [39, 40, 41, 42, 43, 44, 47, 48, 49, 53], "think": 71, "throughput": [18, 22, 68, 69, 70], "time": [75, 84], "tip": [64, 70, 87], "tllmlogger": 1, "tok": 20, "token": [20, 21, 32, 74, 76], "tool": 17, "top": 91, "translat": 15, "tree": [10, 90], "triton": [3, 83], "troubl": 15, 
"troubleshoot": [2, 64, 70, 87], "trt": 23, "trtllm": [24, 25, 26, 51, 52, 67, 70, 83], "tune": [8, 18, 73, 74], "type": 0, "understand": [74, 84], "unit": 87, "up": [19, 22, 23], "updat": 88, "upload": [27, 28], "us": [7, 9, 10, 39, 47, 48, 49, 75, 76, 84], "usag": [2, 84], "user": 72, "v": [4, 21], "valid": 68, "variabl": [2, 69], "verif": 24, "verifi": 12, "via": 70, "visual": 67, "w4a16": 85, "w8a16": 85, "w8a8": 85, "weight": [11, 12, 13, 14, 15, 16, 84, 85, 90], "welcom": 59, "what": [20, 23, 66], "when": [7, 24], "width": 3, "window": [5, 66, 76], "wip": 18, "within": 22, "without": 60, "work": [24, 68], "workflow": [7, 15, 17, 67, 68], "workload": 24, "world": 6, "worldconfig": 1, "write": 12, "xqa": [5, 22], "you": [66, 70], "your": 94}})
\ No newline at end of file
+Search.setIndex({"alltitles": {"1. Download TensorRT-LLM": [[20, "download-tensorrt-llm"]], "1. Weights size": [[89, "weights-size"]], "2. Activation size": [[89, "activation-size"]], "2. Download the DeepSeek R1 models": [[20, "download-the-deepseek-r1-models"]], "3. Build and run TensorRT-LLM container": [[20, "build-and-run-tensorrt-llm-container"]], "3. I/O tensors": [[89, "i-o-tensors"]], "3.1 Runtime and decoder buffers except KV cache tensor": [[89, "runtime-and-decoder-buffers-except-kv-cache-tensor"]], "3.2 KV cache tensor": [[89, "kv-cache-tensor"]], "4. Compile and Install TensorRT-LLM": [[20, "compile-and-install-tensorrt-llm"]], "5. Optional: Tune GPU clocks": [[20, "optional-tune-gpu-clocks"]], "6. Dataset preparation": [[20, "dataset-preparation"]], "@record_signature to Decorate Functionals Requiring FLayerInfo": [[7, "record-signature-to-decorate-functionals-requiring-flayerinfo"]], "ALiBi": [[5, "alibi"]], "API": [[3, "api"]], "API Changes": [[13, "api-changes"], [93, "api-changes"], [93, "id9"], [93, "id14"], [93, "id19"], [93, "id24"], [93, "id31"], [93, "id36"], [93, "id42"], [93, "id48"], [93, "id54"]], "API Introduction": [[69, null]], "API Reference": [[70, null]], "AWQ Quantization Scaling Factors": [[15, "awq-quantization-scaling-factors"]], "About": [[30, "about"]], "About Speculative Sampling": [[12, "about-speculative-sampling"]], "About TensorRT-LLM": [[71, "about-tensorrt-llm"]], "Accuracy": [[25, "accuracy"]], "Accuracy studies for Relaxed Acceptance": [[27, "accuracy-studies-for-relaxed-acceptance"]], "Achieving speedup with MTP speculative decoding": [[27, "achieving-speedup-with-mtp-speculative-decoding"]], "Acknowledgment": [[26, "acknowledgment"], [27, "acknowledgment"], [28, "acknowledgment"]], "Activation": [[83, "module-tensorrt_llm.layers.activation"]], "Adding a Model": [[14, null]], "Adding a New Model in PyTorch Backend": [[95, null]], "Advanced": [[64, null]], "Algorithm": [[11, "algorithm"]], "Announcements": [[93, 
"announcements"], [93, "id52"]], "Architecture": [[64, null]], "Architecture Ovewiew": [[96, null]], "Asyncio-Based Generation": [[36, "asyncio-based-generation"]], "Attention": [[83, "module-tensorrt_llm.layers.attention"], [97, null]], "Attention Backends": [[97, "attention-backends"]], "Attention Kernel": [[26, "attention-kernel"]], "Attention Weights": [[15, "attention-weights"]], "Attention for MTP": [[27, "attention-for-mtp"]], "Auto parallel arguments": [[29, "tensorrt_llm.commands.build-parse_arguments-auto-parallel-arguments"]], "Automatic Parallelism with LLM": [[42, null]], "Autoregressive MTP Layers": [[26, "autoregressive-mtp-layers"]], "B200 max-throughput": [[20, "b200-max-throughput"]], "B200 min-latency": [[20, "b200-min-latency"]], "Background": [[26, "background"], [27, "background"]], "Basic Implementation": [[27, "basic-implementation"]], "Beam-Search": [[5, "beam-search"]], "Before Benchmarking": [[73, "before-benchmarking"]], "Before You Begin: TensorRT-LLM LLM-API": [[75, "before-you-begin-tensorrt-llm-llm-api"]], "Benchmark": [[20, "benchmark"], [25, "benchmark"], [30, "benchmark"]], "Benchmarking Default Performance": [[75, null]], "Benchmarking a non-Medusa Low Latency Engine": [[73, "benchmarking-a-non-medusa-low-latency-engine"]], "Benchmarking with trtllm-bench": [[75, "benchmarking-with-trtllm-bench"]], "Benchmarks": [[2, "benchmarks"]], "Best practices to choose the right quantization methods": [[25, "best-practices-to-choose-the-right-quantization-methods"]], "Block": [[8, "block"]], "Boost settings": [[73, "boost-settings"]], "Build APIs": [[19, "build-apis"]], "Build Checkpoint into TensorRT Engine": [[15, "build-checkpoint-into-tensorrt-engine"]], "Build Configuration": [[36, "build-configuration"]], "Build TensorRT-LLM": [[65, "build-tensorrt-llm"]], "Build the TensorRT-LLM Docker Image": [[31, null]], "Build the TensorRT-LLM Docker Image and Upload to DockerHub": [[31, 
"build-the-tensorrt-llm-docker-image-and-upload-to-dockerhub"], [32, "build-the-tensorrt-llm-docker-image-and-upload-to-dockerhub"]], "Building a Benchmark Engine": [[73, "building-a-benchmark-engine"]], "Building a Medusa Low-Latency Engine": [[73, "building-a-medusa-low-latency-engine"]], "Building a TensorRT-LLM Docker Image": [[65, "building-a-tensorrt-llm-docker-image"]], "Building and Saving Engines via CLI": [[75, "building-and-saving-engines-via-cli"]], "Building and Saving the Engine": [[75, "building-and-saving-the-engine"]], "Building from Source Code on Linux": [[65, null]], "Building the Python Bindings for the C++ Runtime": [[65, "building-the-python-bindings-for-the-c-runtime"]], "C++ Executor API Example": [[3, "c-executor-api-example"]], "C++ GPT Runtime": [[6, null]], "C++ runtime": [[89, "c-runtime"], [89, "id1"]], "CLI Tools": [[19, "cli-tools"]], "CUDA Graph & Programmatic Dependent Launch": [[26, "cuda-graph-programmatic-dependent-launch"]], "CUTLASS Backend (default backend)": [[26, "cutlass-backend-default-backend"]], "Capacity Scheduler Policy": [[81, "capacity-scheduler-policy"]], "Cast": [[83, "module-tensorrt_llm.layers.cast"]], "Chat API": [[30, "chat-api"]], "Chunked Context": [[5, "chunked-context"]], "Classical Workflow": [[7, "classical-workflow"]], "Closing": [[21, "closing"], [24, "closing"]], "Collect PyTorch profiler results": [[72, "collect-pytorch-profiler-results"]], "Command Overview": [[74, "command-overview"]], "Common LLM Support": [[71, "common-llm-support"]], "Communication Kernel": [[26, "communication-kernel"]], "Compilation": [[16, "compilation"]], "Compile the Model into a TensorRT Engine": [[88, "compile-the-model-into-a-tensorrt-engine"]], "Completions API": [[30, "completions-api"], [30, "id1"]], "Conclusion": [[77, "conclusion"], [79, "conclusion"], [80, "conclusion"]], "Config": [[15, "config"]], "Configure SSH Key": [[32, "configure-ssh-key"]], "Configure The Executor": [[3, "configure-the-executor"]], 
"Connect to the Pod": [[32, "connect-to-the-pod"]], "Context Chunking Policy": [[81, "context-chunking-policy"]], "Context Phase": [[5, "context-phase"]], "Context and Generation Phases": [[5, "context-and-generation-phases"]], "Contiguous KV Cache": [[5, "contiguous-kv-cache"]], "Control generated text using logits processor": [[52, null]], "Controlling output with Logits Post-Processor": [[3, "controlling-output-with-logits-post-processor"]], "Conv": [[83, "module-tensorrt_llm.layers.conv"]], "Conversion APIs": [[19, "conversion-apis"]], "Coordinating with NVIDIA Nsight Systems Launch": [[72, "coordinating-with-nvidia-nsight-systems-launch"]], "Coordinating with PyTorch profiler (PyTorch workflow only)": [[72, "coordinating-with-pytorch-profiler-pytorch-workflow-only"]], "Core Models": [[95, "core-models"]], "Create a Pod Template": [[32, "create-a-pod-template"]], "Create a Runpod account": [[32, "create-a-runpod-account"]], "Create the Container": [[65, "create-the-container"]], "Cross Attention": [[5, "cross-attention"]], "Curl Chat Client": [[33, null]], "Curl Chat Client For Multimodal": [[34, null]], "Curl Completion Client": [[35, null]], "Customize KV Cache Manager": [[98, "customize-kv-cache-manager"]], "Customize Your Own Scheduler": [[99, "customize-your-own-scheduler"]], "Data Parallel for Attention module (ADP)": [[28, "data-parallel-for-attention-module-adp"]], "Debug Execution Errors": [[92, "debug-execution-errors"]], "Debug on E2E Models": [[92, "debug-on-e2e-models"]], "Debug on Unit Tests": [[92, "debug-on-unit-tests"]], "Debugging FAQs": [[2, "debugging-faqs"]], "Deciding Model Sharding Strategy": [[76, null]], "Decoder": [[96, "decoder"]], "DeepSeek R1 MTP Implementation and Optimization": [[27, null]], "Deepseek R1 Reasoning Parser": [[37, null]], "Default Build Behavior": [[73, "default-build-behavior"]], "Dense GEMM optimization": [[26, "dense-gemm-optimization"]], "Deploy with Triton Inference Server": [[88, 
"deploy-with-triton-inference-server"]], "Deploy with trtllm-serve": [[88, "deploy-with-trtllm-serve"]], "Develop TensorRT-LLM on Runpod": [[32, null]], "Developer Guide": [[94, "developer-guide"]], "Disable Tokenizer": [[36, "disable-tokenizer"]], "Disaggregated-Service (experimental)": [[2, null]], "Distributed LLM Generation": [[50, null]], "DoRA": [[10, "dora"]], "Documentation": [[93, "documentation"], [93, "id28"]], "Draft-Target-Model": [[12, "draft-target-model"]], "EAGLE": [[12, "eagle"]], "Eagle3 support": [[27, "eagle3-support"]], "Embedding": [[83, "module-tensorrt_llm.layers.embedding"]], "Enable GIL information in NVTX markers": [[72, "enable-gil-information-in-nvtx-markers"]], "Enable garbage collection (GC) NVTX markers": [[72, "enable-garbage-collection-gc-nvtx-markers"]], "Enable kv cache reuse for p-tuning": [[9, "enable-kv-cache-reuse-for-p-tuning"]], "Enable more NVTX markers for debugging": [[72, "enable-more-nvtx-markers-for-debugging"]], "Enable ssh access to the container": [[31, "enable-ssh-access-to-the-container"]], "Enabling GEMM + SwiGLU Fusion": [[77, "enabling-gemm-swiglu-fusion"]], "Enabling GEMM Plugin": [[80, "enabling-gemm-plugin"]], "Enabling Low Latency GEMM plugin": [[77, "enabling-low-latency-gemm-plugin"]], "Enabling Paged Context Attention": [[80, "enabling-paged-context-attention"]], "Enabling Quantization": [[77, "enabling-quantization"]], "Enabling Quantized KV Cache": [[77, "enabling-quantized-kv-cache"]], "Enabling Reduce Norm Fusion Plugin": [[80, "enabling-reduce-norm-fusion-plugin"]], "Enabling Reduce Norm Fusion with User Buffers": [[77, "enabling-reduce-norm-fusion-with-user-buffers"]], "Enabling building with multiple profiles": [[80, "enabling-building-with-multiple-profiles"]], "Environment Variables": [[2, "environment-variables"], [11, "environment-variables"]], "Evaluation": [[27, "evaluation"]], "Events in KVCacheEventManager": [[8, "events-in-kvcacheeventmanager"]], "Everything in One Diagram": [[26, 
"everything-in-one-diagram"]], "Example": [[2, "example"], [15, "example"]], "Example LoRA tensors": [[10, "example-lora-tensors"]], "Example of Build Subcommand Output:": [[73, "example-of-build-subcommand-output"]], "Examples": [[16, "examples"], [17, "examples"], [72, "examples"]], "Executor": [[0, null]], "Executor API": [[3, null]], "Expected Result Format": [[20, "expected-result-format"], [20, "id1"], [20, "id2"]], "Expected Results": [[20, "expected-results"]], "Expert Parallelism in TensorRT-LLM": [[4, null]], "Expert parallel for MoE (EP)": [[28, "expert-parallel-for-moe-ep"]], "Exploring more ISL/OSL combinations": [[20, "exploring-more-isl-osl-combinations"]], "FAQ": [[89, "faq"]], "FLayerInfo for Retrieving High-Level Information for a Functional": [[7, "flayerinfo-for-retrieving-high-level-information-for-a-functional"]], "FP32, FP16 and BF16": [[90, "fp32-fp16-and-bf16"]], "FP4 Models:": [[74, "fp4-models"]], "FP8 (Hopper)": [[90, "fp8-hopper"]], "FP8 Context FMHA": [[5, "fp8-context-fmha"]], "FP8 Models:": [[74, "fp8-models"]], "FP8 Quantization": [[77, null]], "FP8 Quantization Scaling Factors": [[15, "fp8-quantization-scaling-factors"]], "FP8 Support": [[71, "fp8-support"]], "FP8 \u201cBaseline\u201d Performance": [[77, "fp8-baseline-performance"]], "Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100": [[21, null]], "Falcon-180B on a single H200 with INT4 AWQ": [[21, "falcon-180b-on-a-single-h200-with-int4-awq"]], "Feature Descriptions": [[72, "feature-descriptions"]], "Fix known issues": [[27, "fix-known-issues"]], "Fixed Issues": [[93, "fixed-issues"], [93, "id11"], [93, "id15"], [93, "id21"], [93, "id26"], [93, "id33"], [93, "id38"], [93, "id44"], [93, "id50"], [93, "id56"], [93, "id61"]], "Fully customized": [[17, "fully-customized"]], "Functionals": [[82, null]], "Fuse_A_GEMM": [[26, "fuse-a-gemm"]], "Future Works": [[26, "future-works"], [27, "future-works"], [28, "future-works"]], "Future-Style Generation": 
[[36, "future-style-generation"]], "GEMM + SwiGLU Fusion in Gated-MLP": [[77, "gemm-swiglu-fusion-in-gated-mlp"]], "GEMM Plugin": [[80, "gemm-plugin"]], "GPTQ and AWQ (W4A16)": [[90, "gptq-and-awq-w4a16"]], "GPU Clock Management": [[73, "gpu-clock-management"]], "Genai Perf Client": [[38, null]], "Genai Perf Client For Multimodal": [[39, null]], "General FAQs": [[2, "general-faqs"]], "Generate Text Asynchronously": [[47, null]], "Generate Text Using Eagle Decoding": [[44, null]], "Generate Text Using Eagle2 Decoding": [[43, null]], "Generate Text Using Lookahead Decoding": [[53, null]], "Generate Text Using Medusa Decoding": [[54, null]], "Generate Text in Streaming": [[48, null]], "Generate text": [[46, null]], "Generate text with customization": [[49, null]], "Generate text with guided decoding": [[45, null]], "Generate text with multiple LoRA adapters": [[58, null]], "Generation": [[36, "generation"]], "Generation Phase": [[5, "generation-phase"]], "Generation with Quantization": [[59, null]], "Get KV Cache Events": [[51, null]], "Getting Started": [[64, null]], "Graph Rewriting APIs": [[7, "graph-rewriting-apis"]], "Graph Rewriting Module": [[7, null]], "Grouped GEMM": [[26, "grouped-gemm"]], "H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token": [[22, null]], "H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM": [[23, null]], "H200 max-throughput": [[20, "h200-max-throughput"]], "H200 min-latency": [[20, "h200-min-latency"]], "H200 vs H100": [[23, "h200-vs-h100"]], "Hardware": [[91, "hardware"]], "Hierarchy: Pool, Block, and Page": [[8, "hierarchy-pool-block-and-page"]], "How the Benchmarker Works": [[73, "how-the-benchmarker-works"]], "How to Enable": [[4, "how-to-enable"]], "How to Think about Model Sharding: Communication is Key": [[76, "how-to-think-about-model-sharding-communication-is-key"]], "How to change Max Batch Size": [[79, "how-to-change-max-batch-size"]], "How to change Max Num 
Tokens": [[79, "how-to-change-max-num-tokens"]], "How to enable kv cache reuse": [[9, "how-to-enable-kv-cache-reuse"]], "How to get best performance on DeepSeek-R1 in TensorRT-LLM": [[20, null]], "How to reproduce": [[26, "how-to-reproduce"], [28, "how-to-reproduce"]], "How to run DeepSeek models with MTP": [[27, "how-to-run-deepseek-models-with-mtp"]], "How to run the DeepSeek-R1 model with Relaxed Acceptance": [[27, "how-to-run-the-deepseek-r1-model-with-relaxed-acceptance"]], "How to set Tensor Parallelism and Pipeline Parallelism": [[76, "how-to-set-tensor-parallelism-and-pipeline-parallelism"]], "Hugging Face Hub": [[69, "hugging-face-hub"]], "INT4 and INT8 Weight-Only (W4A16 and W8A16)": [[90, "int4-and-int8-weight-only-w4a16-and-w8a16"]], "INT8 SmoothQuant (W8A8)": [[90, "int8-smoothquant-w8a8"]], "INT8/FP8 KV Caches": [[5, "int8-fp8-kv-caches"]], "Implement AttentionBackend": [[97, "implement-attentionbackend"]], "Implement AttentionMetadata": [[97, "implement-attentionmetadata"]], "Implement a New Attention Backend": [[97, "implement-a-new-attention-backend"]], "Implementation Configuration": [[26, "implementation-configuration"]], "Important Note": [[5, "important-note"]], "In-Flight Batching and Paged Attention": [[71, "in-flight-batching-and-paged-attention"]], "In-flight Batching": [[5, "in-flight-batching"]], "In-flight Batching with the Triton Inference Server": [[3, "in-flight-batching-with-the-triton-inference-server"]], "Indices and tables": [[64, "indices-and-tables"]], "Inference Endpoints": [[30, "inference-endpoints"]], "Infrastructure Changes": [[93, "infrastructure-changes"], [93, "id4"], [93, "id7"], [93, "id12"], [93, "id16"], [93, "id22"], [93, "id27"], [93, "id34"], [93, "id39"], [93, "id45"]], "Infrastructure changes": [[93, "id51"]], "Input QKV tensor": [[5, "input-qkv-tensor"]], "Installation": [[64, null]], "Installation Errors": [[92, "installation-errors"]], "Installing on Grace Hopper": [[66, null]], "Installing on Linux": [[67, 
null]], "Interfaces": [[98, "interfaces"]], "Internal Components": [[6, "internal-components"]], "Introduction": [[28, "introduction"], [95, "introduction"]], "KV Cache": [[5, "kv-cache"]], "KV Cache Management: Pools, Blocks, and Events": [[8, null]], "KV Cache Manager": [[98, null]], "KV Cache Manager Introduction": [[98, "kv-cache-manager-introduction"]], "KV Cache Pool Management": [[8, "kv-cache-pool-management"]], "KV Cache Quantization Scaling Factors": [[15, "kv-cache-quantization-scaling-factors"]], "KV cache reuse": [[9, null]], "KVCacheManager": [[96, "kvcachemanager"]], "Kernel Level optimizations": [[26, "kernel-level-optimizations"]], "Kernel fusion": [[26, "kernel-fusion"]], "Key Components": [[94, "key-components"]], "Key Features": [[68, null]], "Key Features and Enhancements": [[93, "key-features-and-enhancements"], [93, "id2"], [93, "id3"], [93, "id5"], [93, "id8"], [93, "id13"], [93, "id18"], [93, "id23"], [93, "id30"], [93, "id35"], [93, "id41"], [93, "id47"], [93, "id53"], [93, "id57"], [93, "id59"]], "Key Optimizations": [[26, "key-optimizations"]], "Known Issues": [[89, "known-issues"], [93, "known-issues"], [93, "id6"], [93, "id10"], [93, "id17"], [93, "id29"], [93, "id40"], [93, "id46"], [93, "id62"], [94, "known-issues"]], "Known Limitations": [[65, "known-limitations"]], "LLM API": [[88, "llm-api"]], "LLM API Examples": [[40, null]], "LLM Common Customizations": [[36, null]], "LLM Examples": [[41, null]], "LLM Examples Introduction": [[40, null]], "LLM Models": [[91, "llm-models"]], "Latest GPU Support": [[71, "latest-gpu-support"]], "Latest HBM Memory": [[23, "latest-hbm-memory"]], "LayerNorm Weights": [[15, "layernorm-weights"]], "Layers": [[83, null]], "Limitations": [[12, "limitations"], [93, "limitations"]], "Limitations and Caveats": [[73, "limitations-and-caveats"]], "Linear": [[83, "module-tensorrt_llm.layers.linear"]], "Linking with the TensorRT-LLM C++ Runtime": [[65, "linking-with-the-tensorrt-llm-c-runtime"]], "Llama 3.1 
405B": [[16, "llama-3-1-405b"]], "Llama 3.1 405B FP4": [[74, "llama-3-1-405b-fp4"]], "Llama 3.1 405B FP8": [[74, "llama-3-1-405b-fp8"]], "Llama 3.1 70B": [[16, "llama-3-1-70b"]], "Llama 3.1 70B FP8": [[74, "llama-3-1-70b-fp8"]], "Llama 3.1 8B FP8": [[74, "llama-3-1-8b-fp8"]], "Llama 3.3 70B FP4": [[74, "llama-3-3-70b-fp4"]], "Llama-70B on H200 up to 2.4x increased throughput with XQA within same latency budget": [[24, "llama-70b-on-h200-up-to-2-4x-increased-throughput-with-xqa-within-same-latency-budget"]], "Llama-70B on H200 up to 6.7x A100": [[21, "llama-70b-on-h200-up-to-6-7x-a100"]], "Llm Mgmn Llm Distributed": [[55, null]], "Llm Mgmn Trtllm Bench": [[56, null]], "Llm Mgmn Trtllm Serve": [[57, null]], "LoRA Module id mapping": [[10, "lora-module-id-mapping"]], "LoRA arguments": [[29, "tensorrt_llm.commands.build-parse_arguments-lora-arguments"]], "LoRA tensor format details": [[10, "lora-tensor-format-details"]], "LoRA with tensor parallel": [[10, "lora-with-tensor-parallel"]], "Loading function": [[17, "loading-function"]], "Local Hugging Face Models": [[69, "local-hugging-face-models"]], "Local TensorRT-LLM Engine": [[69, "local-tensorrt-llm-engine"]], "Logits arguments": [[29, "tensorrt_llm.commands.build-parse_arguments-logits-arguments"]], "Lookahead Decoding": [[12, "lookahead-decoding"]], "LoraCache configuration": [[10, "loracache-configuration"]], "Low Latency Benchmark": [[73, "low-latency-benchmark"]], "Low Latency GEMM Plugin": [[77, "low-latency-gemm-plugin"]], "Low Latency TensorRT-LLM Engine for Llama-3 70B": [[73, "low-latency-tensorrt-llm-engine-for-llama-3-70b"]], "Low-Precision-AllReduce": [[11, null]], "MLA Layers Optimizations": [[28, "mla-layers-optimizations"]], "MLP": [[83, "module-tensorrt_llm.layers.mlp"]], "MLP Weights": [[15, "mlp-weights"]], "MLPerf on H100 with FP8": [[22, "mlperf-on-h100-with-fp8"]], "MTP": [[26, "mtp"]], "MTP Eagle": [[27, "mtp-eagle"]], "MTP Modules": [[27, "mtp-modules"]], "MTP Vanilla": [[27, "mtp-vanilla"]], 
"MTP for inference": [[27, "mtp-for-inference"]], "MTP implementation in TensorRT-LLM": [[27, "mtp-implementation-in-tensorrt-llm"]], "MTP optimization - Relaxed Acceptance": [[27, "mtp-optimization-relaxed-acceptance"]], "Make Evaluation": [[15, "make-evaluation"]], "Mark Tensors As Output": [[3, "mark-tensors-as-output"]], "Max Throughput Benchmark": [[73, "max-throughput-benchmark"]], "Max Tokens in Paged KV Cache and KV Cache Free GPU Memory Fraction": [[81, "max-tokens-in-paged-kv-cache-and-kv-cache-free-gpu-memory-fraction"]], "Maximum Attention Window Size": [[81, "maximum-attention-window-size"]], "Medusa": [[12, "medusa"]], "Medusa Tree": [[12, "medusa-tree"]], "Memory Usage of TensorRT-LLM": [[89, null]], "Memory pool": [[89, "memory-pool"]], "Metrics Endpoint": [[30, "metrics-endpoint"]], "Mixed ETP": [[26, "mixed-etp"]], "Mixture of Experts (MoE)": [[4, "mixture-of-experts-moe"]], "MoE Layers Optimizations": [[28, "moe-layers-optimizations"]], "Model Architecture": [[26, "model-architecture"]], "Model Configuration": [[6, "model-configuration"], [95, "model-configuration"]], "Model Definition": [[16, null], [95, "model-definition"]], "Model Definition API": [[88, "model-definition-api"]], "Model Engine": [[16, "model-engine"], [96, "model-engine"]], "Model Preparation": [[69, "model-preparation"]], "Model Registration": [[95, "model-registration"]], "Model Updates": [[93, "model-updates"], [93, "id20"], [93, "id25"], [93, "id32"], [93, "id37"], [93, "id43"], [93, "id49"], [93, "id55"], [93, "id58"], [93, "id60"]], "Model Weights": [[18, "model-weights"]], "Models": [[84, null]], "Models (PyTorch Backend)": [[91, "models-pytorch-backend"]], "Models (TensorRT Backend)": [[91, "models-tensorrt-backend"]], "Models with customized key names": [[17, "models-with-customized-key-names"]], "Models with customized weight layout": [[17, "models-with-customized-weight-layout"]], "Multi-GPU Multi-Node Inference": [[71, "multi-gpu-multi-node-inference"]], "Multi-GPU 
and Multi-Node Support": [[16, "multi-gpu-and-multi-node-support"]], "Multi-Head, Multi-Query, and Group-Query Attention": [[5, null]], "Multi-Modal Models 3": [[91, "multi-modal-models"]], "Multi-node Serving with Slurm": [[30, "multi-node-serving-with-slurm"]], "Multi-streams": [[26, "multi-streams"]], "Multimodal Serving": [[30, "multimodal-serving"]], "Multiple Profiles": [[80, "multiple-profiles"]], "NVFP4 (Blackwell)": [[90, "nvfp4-blackwell"]], "Named Arguments": [[29, "tensorrt_llm.commands.build-parse_arguments-named-arguments"]], "Native Windows Support": [[71, "native-windows-support"]], "Natively supported models": [[17, "natively-supported-models"]], "New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget": [[24, null]], "Next Steps": [[88, "next-steps"]], "Normalization": [[83, "module-tensorrt_llm.layers.normalization"]], "Note on context outputs": [[3, "note-on-context-outputs"]], "Numerical Precision": [[90, null]], "Obtaining Arbitrary Output Tensors": [[3, "obtaining-arbitrary-output-tensors"]], "Offloading to host memory": [[9, "offloading-to-host-memory"]], "Online Serving Examples": [[63, null]], "Only collect specific iterations": [[72, "only-collect-specific-iterations"]], "OpenAI Chat Client": [[60, null], [61, null]], "OpenAI Completion Client": [[62, null]], "Optimizing DeepSeek R1 Throughput on NVIDIA Blackwell GPUs: A Deep Dive for Developers": [[28, null]], "Option 1: Build TensorRT-LLM in One Step": [[65, "option-1-build-tensorrt-llm-in-one-step"]], "Option 1: Full Build with C++ Compilation": [[65, "option-1-full-build-with-c-compilation"]], "Option 2: Build TensorRT-LLM Step-by-Step": [[65, "option-2-build-tensorrt-llm-step-by-step"]], "Option 2: Python-Only Build without C++ Compilation": [[65, "option-2-python-only-build-without-c-compilation"]], "Other Build Modes": [[73, "other-build-modes"]], "Out of memory issues": [[20, "out-of-memory-issues"]], "Out-of-Tree Models": [[95, 
"out-of-tree-models"]], "Overview": [[6, "overview"], [15, "overview"], [17, "overview"], [19, "overview"], [71, null], [74, null]], "Padded and Packed Tensors": [[5, "padded-and-packed-tensors"]], "Page": [[8, "page"]], "Paged Context Attention": [[80, "paged-context-attention"]], "Paged KV Cache": [[5, "paged-kv-cache"]], "Parallel strategy": [[28, "parallel-strategy"]], "Parallelism Mapping Support": [[73, "parallelism-mapping-support"]], "Parallelism Strategy": [[26, "parallelism-strategy"]], "Pattern and Pattern Manager": [[7, "pattern-and-pattern-manager"]], "Pattern-Matching and Fusion": [[16, "pattern-matching-and-fusion"]], "Performance": [[25, "performance"], [64, null], [80, "performance"]], "Performance Analysis": [[72, null]], "Performance Improvements": [[12, "performance-improvements"]], "Performance Tuning Guide": [[78, null]], "Performance and Accuracy Considerations": [[11, "performance-and-accuracy-considerations"]], "Performance expectations": [[9, "performance-expectations"]], "Performance with GEMM + SwiGLU Fusion": [[77, "performance-with-gemm-swiglu-fusion"]], "Performance with GEMM Plugin": [[80, "performance-with-gemm-plugin"]], "Performance with Low Latency GEMM plugin": [[77, "performance-with-low-latency-gemm-plugin"]], "Performance with Quantized KV Cache": [[77, "performance-with-quantized-kv-cache"]], "Performance with Reduce Norm Fusion": [[80, "performance-with-reduce-norm-fusion"]], "Performance with Reduce Norm Fusion + User Buffers:": [[77, "performance-with-reduce-norm-fusion-user-buffers"]], "Performance with multiple profiles": [[80, "performance-with-multiple-profiles"]], "Persistence mode": [[73, "persistence-mode"]], "Pipeline Parallel Reduce Scatter Optimization": [[80, "pipeline-parallel-reduce-scatter-optimization"]], "Plugin": [[85, null]], "Plugin config arguments": [[29, "tensorrt_llm.commands.build-parse_arguments-plugin-config-arguments"]], "Plugins": [[16, "plugins"]], "Pool": [[8, "pool"]], "Pooling": [[83, 
"module-tensorrt_llm.layers.pooling"]], "Postprocessing functions": [[17, "postprocessing-functions"]], "Precision Strategy": [[26, "precision-strategy"]], "Precision strategy": [[28, "precision-strategy"]], "Prepare": [[32, "prepare"]], "Prepare Dataset": [[75, "prepare-dataset"]], "Prepare the TensorRT-LLM Checkpoint": [[15, "prepare-the-tensorrt-llm-checkpoint"]], "Preparing a Dataset": [[73, "preparing-a-dataset"], [74, "preparing-a-dataset"]], "Prerequisite Knowledge": [[78, "prerequisite-knowledge"]], "Prerequisites": [[65, "prerequisites"], [88, "prerequisites"], [95, "prerequisites"]], "Prerequisites: Install TensorRT-LLM and download models": [[20, "prerequisites-install-tensorrt-llm-and-download-models"]], "Profiling specific iterations on a trtllm-bench/trtllm-serve run": [[72, "profiling-specific-iterations-on-a-trtllm-bench-trtllm-serve-run"]], "Prompt-Lookup-Decoding": [[12, "prompt-lookup-decoding"]], "Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs": [[26, null]], "PyExecutor": [[96, "pyexecutor"]], "PyTorch Backend": [[94, null]], "Python Bindings for the Executor API": [[3, "python-bindings-for-the-executor-api"]], "Python runtime (Not recommended to be used)": [[89, "python-runtime-not-recommended-to-be-used"]], "Quantization": [[36, "quantization"], [86, null], [94, "quantization"]], "Quantization APIs": [[19, "quantization-apis"]], "Quantization and Dequantization (Q/DQ)": [[90, "quantization-and-dequantization-q-dq"]], "Quantization in TensorRT-LLM": [[25, "quantization-in-tensorrt-llm"]], "Quantization in the PyTorch Flow": [[73, "quantization-in-the-pytorch-flow"]], "Quantized KV-Cache": [[77, "quantized-kv-cache"]], "Quick Start": [[94, "quick-start"]], "Quick Start Guide": [[88, null]], "Quickstart": [[73, "quickstart"]], "Rank Weights": [[15, "rank-weights"]], "Re-balanced the sparse experts": [[26, "re-balanced-the-sparse-experts"]], "ReDrafter": [[12, "redrafter"]], "Reduce Norm Fusion Plugin for Llama 
models:": [[80, "reduce-norm-fusion-plugin-for-llama-models"]], "Reduce Norm Fusion with User Buffers for Llama Models": [[77, "reduce-norm-fusion-with-user-buffers-for-llama-models"]], "Reference": [[14, "reference"], [64, null]], "Related Information": [[88, "related-information"]], "Relative Attention Bias (RAB)": [[5, "relative-attention-bias-rab"]], "Relax Acceptance Verification": [[26, "relax-acceptance-verification"]], "Relaxed Acceptance": [[27, "relaxed-acceptance"]], "Release Notes": [[93, null]], "Reproducing Benchmarked Results": [[74, "reproducing-benchmarked-results"]], "Reproducing steps": [[20, "reproducing-steps"]], "Request Additional Output": [[3, "request-additional-output"]], "ResourceManager": [[96, "resourcemanager"]], "Results": [[75, "results"]], "Revisiting Paged Context Attention and Context Chunking": [[79, "revisiting-paged-context-attention-and-context-chunking"]], "Rotary Positional Embedding (RoPE)": [[5, "rotary-positional-embedding-rope"]], "RouterGEMM": [[26, "routergemm"]], "Run gpt-2b + LoRA using Executor / cpp runtime": [[10, null]], "Run the Model": [[88, "run-the-model"]], "Running Throughput and Latency Benchmarks": [[75, "running-throughput-and-latency-benchmarks"]], "Running With Weight Streaming to Reduce GPU Memory Consumption": [[13, null]], "Running multi-modal models in the PyTorch Workflow": [[73, "running-multi-modal-models-in-the-pytorch-workflow"]], "Running the Benchmark": [[74, "running-the-benchmark"]], "Running with the PyTorch Workflow": [[73, "running-with-the-pytorch-workflow"]], "Runtime": [[1, null], [16, "runtime"], [87, null]], "Runtime Customization": [[36, "runtime-customization"]], "Runtime Optimizations": [[28, "runtime-optimizations"]], "Sampling": [[36, "sampling"]], "Sampling Parameters": [[6, "sampling-parameters"]], "Scaling factor(s)": [[5, "scaling-factor-s"]], "Scheduler": [[96, "scheduler"], [99, null]], "Scheduler Introduction": [[99, "scheduler-introduction"]], "Scripts": [[41, null], 
[63, null]], "Sending Requests with Different Beam Widths": [[3, "sending-requests-with-different-beam-widths"]], "Set power limits": [[73, "set-power-limits"]], "Situations that can prevent kv cache reuse": [[9, "situations-that-can-prevent-kv-cache-reuse"]], "Sliding Window Attention, Cyclic (Rolling Buffer) KV Cache": [[5, "sliding-window-attention-cyclic-rolling-buffer-kv-cache"]], "Smart Router": [[26, "smart-router"]], "Software": [[91, "software"]], "Sparse Experts as GEMMs (only works when moe_backend=CUTLASS)": [[26, "sparse-experts-as-gemms-only-works-when-moe-backend-cutlass"]], "Speculative Sampling": [[12, null]], "Speculative decoding arguments": [[29, "tensorrt_llm.commands.build-parse_arguments-speculative-decoding-arguments"]], "Speed up inference with SOTA quantization techniques in TRT-LLM": [[25, null]], "Starting a Server": [[30, "starting-a-server"]], "Step 1. Write Modeling Part": [[14, "step-1-write-modeling-part"]], "Step 2. Implement Weight Conversion": [[14, "step-2-implement-weight-conversion"]], "Step 3. Register New Model": [[14, "step-3-register-new-model"]], "Step 4. 
Verify New Model": [[14, "step-4-verify-new-model"]], "Step-by-Step Guide": [[95, "step-by-step-guide"]], "StreamingLLM": [[5, "streamingllm"]], "Structured output with guided decoding": [[3, "structured-output-with-guided-decoding"]], "Summary": [[73, "summary"]], "Summary of Configuration Option Recommendations:": [[77, "summary-of-configuration-option-recommendations"], [80, "summary-of-configuration-option-recommendations"]], "Support Matrix": [[91, null]], "Support matrix": [[90, "support-matrix"]], "Supported C++ Header Files": [[65, "supported-c-header-files"]], "Supported Models": [[69, "supported-models"]], "Supported Quantization Modes": [[73, "supported-quantization-modes"]], "Syntax": [[30, "syntax"]], "System Level optimizations": [[26, "system-level-optimizations"]], "TRTLLM Backend": [[26, "trtllm-backend"]], "Table of Contents": [[20, "table-of-contents"], [26, "table-of-contents"], [27, "table-of-contents"], [28, "table-of-contents"], [78, "table-of-contents"], [95, "table-of-contents"]], "Technical Detail: The QuantMode Flags": [[90, "technical-detail-the-quantmode-flags"]], "Tensor Parallel vs Expert Parallel": [[4, "tensor-parallel-vs-expert-parallel"]], "Tensor-Related Methods": [[7, "tensor-related-methods"]], "TensorRT Compiler": [[16, "tensorrt-compiler"]], "TensorRT-LLM Architecture": [[18, null]], "TensorRT-LLM Benchmarking": [[73, null]], "TensorRT-LLM Build Workflow": [[19, null]], "TensorRT-LLM Checkpoint": [[15, null]], "TensorRT-LLM Model Weights Loader": [[17, null]], "TensorRT-LLM Release 0.10.0": [[93, "tensorrt-llm-release-0-10-0"]], "TensorRT-LLM Release 0.11.0": [[93, "tensorrt-llm-release-0-11-0"]], "TensorRT-LLM Release 0.12.0": [[93, "tensorrt-llm-release-0-12-0"]], "TensorRT-LLM Release 0.13.0": [[93, "tensorrt-llm-release-0-13-0"]], "TensorRT-LLM Release 0.14.0": [[93, "tensorrt-llm-release-0-14-0"]], "TensorRT-LLM Release 0.15.0": [[93, "tensorrt-llm-release-0-15-0"]], "TensorRT-LLM Release 0.16.0": [[93, 
"tensorrt-llm-release-0-16-0"]], "TensorRT-LLM Release 0.17.0": [[93, "tensorrt-llm-release-0-17-0"]], "TensorRT-LLM Release 0.18.0": [[93, "tensorrt-llm-release-0-18-0"]], "TensorRT-LLM Release 0.18.1": [[93, "tensorrt-llm-release-0-18-1"]], "TensorRT-LLM Release 0.18.2": [[93, "tensorrt-llm-release-0-18-2"]], "TensorRT-LLM Release 0.19.0": [[93, "tensorrt-llm-release-0-19-0"]], "TensorRT-LLM Release 0.7.1": [[93, "tensorrt-llm-release-0-7-1"]], "TensorRT-LLM Release 0.8.0": [[93, "tensorrt-llm-release-0-8-0"]], "TensorRT-LLM Release 0.9.0": [[93, "tensorrt-llm-release-0-9-0"]], "The Executor Class": [[3, "the-executor-class"]], "The Request Class": [[3, "the-request-class"]], "The Response Class": [[3, "the-response-class"]], "The Result Class": [[3, "the-result-class"]], "Throughput Benchmarking": [[73, "throughput-benchmarking"]], "Throughput Measurements": [[74, "throughput-measurements"]], "Tips": [[92, "tips"]], "Tips and Troubleshooting": [[69, "tips-and-troubleshooting"]], "Tokenizer Customization": [[36, "tokenizer-customization"]], "Top Level API": [[96, "top-level-api"]], "Topology Requirements": [[11, "topology-requirements"]], "Translator": [[17, "translator"]], "Tree-based speculative decoding support": [[27, "tree-based-speculative-decoding-support"]], "Trouble shooting": [[17, "trouble-shooting"]], "Troubleshooting": [[92, null]], "Troubleshooting Tips and Pitfalls To Avoid": [[75, "troubleshooting-tips-and-pitfalls-to-avoid"]], "Troubleshooting and FAQ": [[2, "troubleshooting-and-faq"]], "Tuning Case Study": [[79, "tuning-case-study"], [79, "id2"]], "Tuning Max Batch Size": [[79, "tuning-max-batch-size"]], "Tuning Max Batch Size and Max Num Tokens": [[79, null]], "Tuning Max Num Tokens": [[79, "tuning-max-num-tokens"]], "Types of Events": [[8, "types-of-events"]], "Understand inference time GPU memory usage": [[89, "understand-inference-time-gpu-memory-usage"]], "Understanding the TensorRT-LLM scheduler": [[79, 
"understanding-the-tensorrt-llm-scheduler"]], "Upload the Docker Image to DockerHub": [[31, "upload-the-docker-image-to-dockerhub"]], "Usage": [[2, "usage"], [11, "usage"]], "Useful Build-Time Flags": [[80, null]], "Useful Runtime Options": [[81, null]], "Using Medusa with TensorRT-LLM": [[12, "using-medusa-with-tensorrt-llm"]], "Validated Networks for Benchmarking": [[73, "validated-networks-for-benchmarking"]], "Variables": [[74, "variables"]], "Visualize the PyTorch profiler results": [[72, "visualize-the-pytorch-profiler-results"]], "WIP: Chunked context support on DeepSeek models": [[20, "wip-chunked-context-support-on-deepseek-models"]], "WIP: Enable more features by default": [[20, "wip-enable-more-features-by-default"]], "Weight Bindings": [[16, "weight-bindings"]], "Weight Loading": [[95, "weight-loading"]], "Weights absorb and MQA": [[28, "weights-absorb-and-mqa"]], "Welcome to TensorRT-LLM\u2019s Documentation!": [[64, null]], "What Can You Do With TensorRT-LLM?": [[71, "what-can-you-do-with-tensorrt-llm"]], "What Triggers an Event?": [[8, "what-triggers-an-event"]], "What is H100 FP8?": [[22, "what-is-h100-fp8"]], "What\u2019s coming next": [[25, "whats-coming-next"]], "When to Use Graph Rewriting?": [[7, "when-to-use-graph-rewriting"]], "WindowBlockManager/BlockManager": [[8, "windowblockmanager-blockmanager"]], "Workflow": [[17, "workflow"], [73, "workflow"]], "Workload Profile": [[26, "workload-profile"]], "World Configuration": [[6, "world-configuration"]], "XQA Optimization": [[5, "xqa-optimization"]], "bufferManager.h": [[1, "buffermanager-h"]], "cacheCommunicator.h": [[0, "cachecommunicator-h"]], "common.h": [[1, "common-h"]], "cudaEvent.h": [[1, "cudaevent-h"]], "cudaStream.h": [[1, "cudastream-h"]], "dataTransceiverState.h": [[0, "datatransceiverstate-h"]], "decoderState.h": [[1, "decoderstate-h"]], "decodingInput.h": [[1, "decodinginput-h"]], "decodingOutput.h": [[1, "decodingoutput-h"]], "disaggServerUtil.h": [[0, "disaggserverutil-h"]], 
"disaggregated": [[30, "trtllm-serve-disaggregated"]], "disaggregated_mpi_worker": [[30, "trtllm-serve-disaggregated-mpi-worker"]], "eagleBuffers.h": [[1, "eaglebuffers-h"]], "eagleModule.h": [[1, "eaglemodule-h"]], "executor.h": [[0, "executor-h"]], "explicitDraftTokensBuffers.h": [[1, "explicitdrafttokensbuffers-h"]], "gptDecoder.h": [[1, "gptdecoder-h"]], "gptDecoderBatched.h": [[1, "gptdecoderbatched-h"]], "gptJsonConfig.h": [[1, "gptjsonconfig-h"]], "iBuffer.h": [[1, "ibuffer-h"]], "iGptDecoderBatched.h": [[1, "igptdecoderbatched-h"]], "iTensor.h": [[1, "itensor-h"]], "ipcNvlsMemory.h": [[1, "ipcnvlsmemory-h"]], "ipcUtils.h": [[1, "ipcutils-h"]], "lookaheadBuffers.h": [[1, "lookaheadbuffers-h"]], "lookaheadModule.h": [[1, "lookaheadmodule-h"]], "loraCache.h": [[1, "loracache-h"]], "loraCachePageManagerConfig.h": [[1, "loracachepagemanagerconfig-h"]], "loraModule.h": [[1, "loramodule-h"]], "medusaModule.h": [[1, "medusamodule-h"]], "memoryCounters.h": [[1, "memorycounters-h"]], "modelConfig.h": [[1, "modelconfig-h"]], "promptTuningParams.h": [[1, "prompttuningparams-h"]], "rawEngine.h": [[1, "rawengine-h"]], "request.h": [[1, "request-h"]], "runtimeDefaults.h": [[1, "runtimedefaults-h"]], "samplingConfig.h": [[1, "samplingconfig-h"]], "serialization.h": [[0, "serialization-h"]], "serve": [[30, "trtllm-serve-serve"]], "speculativeDecodingMode.h": [[1, "speculativedecodingmode-h"]], "speculativeDecodingModule.h": [[1, "speculativedecodingmodule-h"]], "tensor.h": [[0, "tensor-h"]], "tllmLogger.h": [[1, "tllmlogger-h"]], "transferAgent.h": [[0, "transferagent-h"]], "trtllm-build": [[29, null]], "trtllm-serve": [[30, null], [30, "trtllm-serve"]], "types.h": [[0, "types-h"]], "worldConfig.h": [[1, "worldconfig-h"]]}, "docnames": ["_cpp_gen/executor", "_cpp_gen/runtime", "advanced/disaggregated-service", "advanced/executor", "advanced/expert-parallelism", "advanced/gpt-attention", "advanced/gpt-runtime", "advanced/graph-rewriting", "advanced/kv-cache-management", 
"advanced/kv-cache-reuse", "advanced/lora", "advanced/lowprecision-pcie-allreduce", "advanced/speculative-decoding", "advanced/weight-streaming", "architecture/add-model", "architecture/checkpoint", "architecture/core-concepts", "architecture/model-weights-loader", "architecture/overview", "architecture/workflow", "blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM", "blogs/Falcon180B-H200", "blogs/H100vsA100", "blogs/H200launch", "blogs/XQA-kernel", "blogs/quantization-in-TRT-LLM", "blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs", "blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization", "blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs", "commands/trtllm-build", "commands/trtllm-serve", "dev-on-cloud/build-image-to-dockerhub", "dev-on-cloud/dev-on-runpod", "examples/curl_chat_client", "examples/curl_chat_client_for_multimodal", "examples/curl_completion_client", "examples/customization", "examples/deepseek_r1_reasoning_parser", "examples/genai_perf_client", "examples/genai_perf_client_for_multimodal", "examples/index", "examples/llm_api_examples", "examples/llm_auto_parallel", "examples/llm_eagle2_decoding", "examples/llm_eagle_decoding", "examples/llm_guided_decoding", "examples/llm_inference", "examples/llm_inference_async", "examples/llm_inference_async_streaming", "examples/llm_inference_customize", "examples/llm_inference_distributed", "examples/llm_inference_kv_events", "examples/llm_logits_processor", "examples/llm_lookahead_decoding", "examples/llm_medusa_decoding", "examples/llm_mgmn_llm_distributed", "examples/llm_mgmn_trtllm_bench", "examples/llm_mgmn_trtllm_serve", "examples/llm_multilora", "examples/llm_quantization", "examples/openai_chat_client", "examples/openai_chat_client_for_multimodal", "examples/openai_completion_client", "examples/trtllm_serve_examples", "index", "installation/build-from-source-linux", "installation/grace-hopper", 
"installation/linux", "key-features", "llm-api/index", "llm-api/reference", "overview", "performance/perf-analysis", "performance/perf-benchmarking", "performance/perf-overview", "performance/performance-tuning-guide/benchmarking-default-performance", "performance/performance-tuning-guide/deciding-model-sharding-strategy", "performance/performance-tuning-guide/fp8-quantization", "performance/performance-tuning-guide/index", "performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens", "performance/performance-tuning-guide/useful-build-time-flags", "performance/performance-tuning-guide/useful-runtime-flags", "python-api/tensorrt_llm.functional", "python-api/tensorrt_llm.layers", "python-api/tensorrt_llm.models", "python-api/tensorrt_llm.plugin", "python-api/tensorrt_llm.quantization", "python-api/tensorrt_llm.runtime", "quick-start-guide", "reference/memory", "reference/precision", "reference/support-matrix", "reference/troubleshooting", "release-notes", "torch", "torch/adding_new_model", "torch/arch_overview", "torch/attention", "torch/kv_cache_manager", "torch/scheduler"], "envversion": {"sphinx": 62, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.todo": 2, "sphinx.ext.viewcode": 1}, "filenames": ["_cpp_gen/executor.rst", "_cpp_gen/runtime.rst", "advanced/disaggregated-service.md", "advanced/executor.md", "advanced/expert-parallelism.md", "advanced/gpt-attention.md", "advanced/gpt-runtime.md", "advanced/graph-rewriting.md", "advanced/kv-cache-management.md", "advanced/kv-cache-reuse.md", "advanced/lora.md", "advanced/lowprecision-pcie-allreduce.md", "advanced/speculative-decoding.md", "advanced/weight-streaming.md", "architecture/add-model.md", "architecture/checkpoint.md", "architecture/core-concepts.md", 
"architecture/model-weights-loader.md", "architecture/overview.md", "architecture/workflow.md", "blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md", "blogs/Falcon180B-H200.md", "blogs/H100vsA100.md", "blogs/H200launch.md", "blogs/XQA-kernel.md", "blogs/quantization-in-TRT-LLM.md", "blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md", "blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md", "blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md", "commands/trtllm-build.rst", "commands/trtllm-serve.rst", "dev-on-cloud/build-image-to-dockerhub.md", "dev-on-cloud/dev-on-runpod.md", "examples/curl_chat_client.rst", "examples/curl_chat_client_for_multimodal.rst", "examples/curl_completion_client.rst", "examples/customization.md", "examples/deepseek_r1_reasoning_parser.rst", "examples/genai_perf_client.rst", "examples/genai_perf_client_for_multimodal.rst", "examples/index.rst", "examples/llm_api_examples.rst", "examples/llm_auto_parallel.rst", "examples/llm_eagle2_decoding.rst", "examples/llm_eagle_decoding.rst", "examples/llm_guided_decoding.rst", "examples/llm_inference.rst", "examples/llm_inference_async.rst", "examples/llm_inference_async_streaming.rst", "examples/llm_inference_customize.rst", "examples/llm_inference_distributed.rst", "examples/llm_inference_kv_events.rst", "examples/llm_logits_processor.rst", "examples/llm_lookahead_decoding.rst", "examples/llm_medusa_decoding.rst", "examples/llm_mgmn_llm_distributed.rst", "examples/llm_mgmn_trtllm_bench.rst", "examples/llm_mgmn_trtllm_serve.rst", "examples/llm_multilora.rst", "examples/llm_quantization.rst", "examples/openai_chat_client.rst", "examples/openai_chat_client_for_multimodal.rst", "examples/openai_completion_client.rst", "examples/trtllm_serve_examples.rst", "index.rst", "installation/build-from-source-linux.md", "installation/grace-hopper.md", "installation/linux.md", "key-features.md", 
"llm-api/index.md", "llm-api/reference.rst", "overview.md", "performance/perf-analysis.md", "performance/perf-benchmarking.md", "performance/perf-overview.md", "performance/performance-tuning-guide/benchmarking-default-performance.md", "performance/performance-tuning-guide/deciding-model-sharding-strategy.md", "performance/performance-tuning-guide/fp8-quantization.md", "performance/performance-tuning-guide/index.rst", "performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.md", "performance/performance-tuning-guide/useful-build-time-flags.md", "performance/performance-tuning-guide/useful-runtime-flags.md", "python-api/tensorrt_llm.functional.rst", "python-api/tensorrt_llm.layers.rst", "python-api/tensorrt_llm.models.rst", "python-api/tensorrt_llm.plugin.rst", "python-api/tensorrt_llm.quantization.rst", "python-api/tensorrt_llm.runtime.rst", "quick-start-guide.md", "reference/memory.md", "reference/precision.md", "reference/support-matrix.md", "reference/troubleshooting.md", "release-notes.md", "torch.md", "torch/adding_new_model.md", "torch/arch_overview.md", "torch/attention.md", "torch/kv_cache_manager.md", "torch/scheduler.md"], "indexentries": {"--backend": [[30, "cmdoption-trtllm-serve-serve-backend", false]], "--cluster_size": [[30, "cmdoption-trtllm-serve-serve-cluster_size", false]], "--config_file": [[30, "cmdoption-trtllm-serve-disaggregated-c", false], [30, "cmdoption-trtllm-serve-disaggregated_mpi_worker-c", false]], "--ep_size": [[30, "cmdoption-trtllm-serve-serve-ep_size", false]], "--extra_llm_api_options": [[30, "cmdoption-trtllm-serve-serve-extra_llm_api_options", false]], "--gpus_per_node": [[30, "cmdoption-trtllm-serve-serve-gpus_per_node", false]], "--host": [[30, "cmdoption-trtllm-serve-serve-host", false]], "--kv_cache_free_gpu_memory_fraction": [[30, "cmdoption-trtllm-serve-serve-kv_cache_free_gpu_memory_fraction", false]], "--log_level": [[30, "cmdoption-trtllm-serve-disaggregated_mpi_worker-log_level", false], [30, 
"cmdoption-trtllm-serve-serve-log_level", false]], "--max_batch_size": [[30, "cmdoption-trtllm-serve-serve-max_batch_size", false]], "--max_beam_width": [[30, "cmdoption-trtllm-serve-serve-max_beam_width", false]], "--max_num_tokens": [[30, "cmdoption-trtllm-serve-serve-max_num_tokens", false]], "--max_seq_len": [[30, "cmdoption-trtllm-serve-serve-max_seq_len", false]], "--num_postprocess_workers": [[30, "cmdoption-trtllm-serve-serve-num_postprocess_workers", false]], "--port": [[30, "cmdoption-trtllm-serve-serve-port", false]], "--pp_size": [[30, "cmdoption-trtllm-serve-serve-pp_size", false]], "--reasoning_parser": [[30, "cmdoption-trtllm-serve-serve-reasoning_parser", false]], "--request_timeout": [[30, "cmdoption-trtllm-serve-disaggregated-r", false]], "--server_start_timeout": [[30, "cmdoption-trtllm-serve-disaggregated-t", false]], "--tokenizer": [[30, "cmdoption-trtllm-serve-serve-tokenizer", false]], "--tp_size": [[30, "cmdoption-trtllm-serve-serve-tp_size", false]], "--trust_remote_code": [[30, "cmdoption-trtllm-serve-serve-trust_remote_code", false]], "-c": [[30, "cmdoption-trtllm-serve-disaggregated-c", false], [30, "cmdoption-trtllm-serve-disaggregated_mpi_worker-c", false]], "-r": [[30, "cmdoption-trtllm-serve-disaggregated-r", false]], "-t": [[30, "cmdoption-trtllm-serve-disaggregated-t", false]], "__init__() (tensorrt_llm.llmapi.buildcacheconfig method)": [[70, "tensorrt_llm.llmapi.BuildCacheConfig.__init__", false]], "__init__() (tensorrt_llm.llmapi.buildconfig method)": [[70, "tensorrt_llm.llmapi.BuildConfig.__init__", false]], "__init__() (tensorrt_llm.llmapi.completionoutput method)": [[70, "tensorrt_llm.llmapi.CompletionOutput.__init__", false]], "__init__() (tensorrt_llm.llmapi.disaggregatedparams method)": [[70, "tensorrt_llm.llmapi.DisaggregatedParams.__init__", false]], "__init__() (tensorrt_llm.llmapi.guideddecodingparams method)": [[70, "tensorrt_llm.llmapi.GuidedDecodingParams.__init__", false]], "__init__() 
(tensorrt_llm.llmapi.kvcacheretentionconfig method)": [[70, "tensorrt_llm.llmapi.KvCacheRetentionConfig.__init__", false]], "__init__() (tensorrt_llm.llmapi.kvcacheretentionconfig.tokenrangeretentionconfig method)": [[70, "tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.__init__", false]], "__init__() (tensorrt_llm.llmapi.llm method)": [[70, "tensorrt_llm.llmapi.LLM.__init__", false]], "__init__() (tensorrt_llm.llmapi.lookaheaddecodingconfig method)": [[70, "tensorrt_llm.llmapi.LookaheadDecodingConfig.__init__", false]], "__init__() (tensorrt_llm.llmapi.mpicommsession method)": [[70, "tensorrt_llm.llmapi.MpiCommSession.__init__", false]], "__init__() (tensorrt_llm.llmapi.quantconfig method)": [[70, "tensorrt_llm.llmapi.QuantConfig.__init__", false]], "__init__() (tensorrt_llm.llmapi.requestoutput method)": [[70, "tensorrt_llm.llmapi.RequestOutput.__init__", false]], "__init__() (tensorrt_llm.llmapi.samplingparams method)": [[70, "tensorrt_llm.llmapi.SamplingParams.__init__", false]], "abort() (tensorrt_llm.llmapi.mpicommsession method)": [[70, "tensorrt_llm.llmapi.MpiCommSession.abort", false]], "abs() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.abs", false]], "abs() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.abs", false]], "activation() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.activation", false]], "adalayernorm (class in tensorrt_llm.layers.normalization)": [[83, "tensorrt_llm.layers.normalization.AdaLayerNorm", false]], "adalayernormcontinuous (class in tensorrt_llm.layers.normalization)": [[83, "tensorrt_llm.layers.normalization.AdaLayerNormContinuous", false]], "adalayernormzero (class in tensorrt_llm.layers.normalization)": [[83, "tensorrt_llm.layers.normalization.AdaLayerNormZero", false]], "adalayernormzerosingle (class in tensorrt_llm.layers.normalization)": [[83, "tensorrt_llm.layers.normalization.AdaLayerNormZeroSingle", false]], "add() (in 
module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.add", false]], "add_input() (tensorrt_llm.functional.conditional method)": [[82, "tensorrt_llm.functional.Conditional.add_input", false]], "add_output() (tensorrt_llm.functional.conditional method)": [[82, "tensorrt_llm.functional.Conditional.add_output", false]], "add_sequence() (tensorrt_llm.runtime.kvcachemanager method)": [[87, "tensorrt_llm.runtime.KVCacheManager.add_sequence", false]], "add_special_tokens (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.add_special_tokens", false]], "additional_model_outputs (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.additional_model_outputs", false]], "alibi (tensorrt_llm.functional.positionembeddingtype attribute)": [[82, "tensorrt_llm.functional.PositionEmbeddingType.alibi", false]], "alibi_with_scale (tensorrt_llm.functional.positionembeddingtype attribute)": [[82, "tensorrt_llm.functional.PositionEmbeddingType.alibi_with_scale", false]], "allgather() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.allgather", false]], "allreduce() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.allreduce", false]], "allreducefusionop (class in tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.AllReduceFusionOp", false]], "allreduceparams (class in tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.AllReduceParams", false]], "allreducestrategy (class in tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.AllReduceStrategy", false]], "apply_batched_logits_processor (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.apply_batched_logits_processor", false]], "apply_llama3_scaling() (tensorrt_llm.functional.ropeembeddingutils static method)": [[82, "tensorrt_llm.functional.RopeEmbeddingUtils.apply_llama3_scaling", false]], "apply_rotary_pos_emb() (tensorrt_llm.functional.ropeembeddingutils 
static method)": [[82, "tensorrt_llm.functional.RopeEmbeddingUtils.apply_rotary_pos_emb", false]], "apply_rotary_pos_emb_chatglm() (tensorrt_llm.functional.ropeembeddingutils static method)": [[82, "tensorrt_llm.functional.RopeEmbeddingUtils.apply_rotary_pos_emb_chatglm", false]], "apply_rotary_pos_emb_cogvlm() (tensorrt_llm.functional.ropeembeddingutils static method)": [[82, "tensorrt_llm.functional.RopeEmbeddingUtils.apply_rotary_pos_emb_cogvlm", false]], "arange() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.arange", false]], "argmax() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.argmax", false]], "assert_valid_quant_algo() (tensorrt_llm.models.gemmaforcausallm class method)": [[84, "tensorrt_llm.models.GemmaForCausalLM.assert_valid_quant_algo", false]], "assertion() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.assertion", false]], "attention (class in tensorrt_llm.layers.attention)": [[83, "tensorrt_llm.layers.attention.Attention", false]], "attentionmaskparams (class in tensorrt_llm.layers.attention)": [[83, "tensorrt_llm.layers.attention.AttentionMaskParams", false]], "attentionmasktype (class in tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.AttentionMaskType", false]], "attentionparams (class in tensorrt_llm.layers.attention)": [[83, "tensorrt_llm.layers.attention.AttentionParams", false]], "attn_backend (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.attn_backend", false]], "attn_processors (tensorrt_llm.models.sd3transformer2dmodel property)": [[84, "tensorrt_llm.models.SD3Transformer2DModel.attn_processors", false]], "audio_engine_dir (tensorrt_llm.runtime.multimodalmodelrunner property)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.audio_engine_dir", false]], "auto (tensorrt_llm.functional.allreducestrategy attribute)": [[82, "tensorrt_llm.functional.AllReduceStrategy.AUTO", false]], "auto_deploy_config 
(tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.auto_deploy_config", false]], "auto_parallel (tensorrt_llm.llmapi.trtllmargs attribute)": [[70, "tensorrt_llm.llmapi.TrtLlmArgs.auto_parallel", false]], "auto_parallel_config (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.auto_parallel_config", false]], "auto_parallel_config (tensorrt_llm.llmapi.trtllmargs property)": [[70, "tensorrt_llm.llmapi.TrtLlmArgs.auto_parallel_config", false]], "auto_parallel_world_size (tensorrt_llm.llmapi.trtllmargs attribute)": [[70, "tensorrt_llm.llmapi.TrtLlmArgs.auto_parallel_world_size", false]], "autotuner_enabled (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.autotuner_enabled", false]], "avg_pool2d() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.avg_pool2d", false]], "avgpool2d (class in tensorrt_llm.layers.pooling)": [[83, "tensorrt_llm.layers.pooling.AvgPool2d", false]], "axes (tensorrt_llm.functional.sliceinputtype attribute)": [[82, "tensorrt_llm.functional.SliceInputType.axes", false]], "bad (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.bad", false]], "bad_token_ids (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.bad_token_ids", false]], "bad_words_list (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.bad_words_list", false]], "baichuanforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.BaichuanForCausalLM", false]], "batch_size (tensorrt_llm.runtime.generationsession attribute)": [[87, "tensorrt_llm.runtime.GenerationSession.batch_size", false]], "batchingtype (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.BatchingType", false]], "beam_search_diversity_rate (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.beam_search_diversity_rate", 
false]], "beam_search_diversity_rate (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.beam_search_diversity_rate", false]], "beam_width_array (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.beam_width_array", false]], "bert_attention() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.bert_attention", false]], "bertattention (class in tensorrt_llm.layers.attention)": [[83, "tensorrt_llm.layers.attention.BertAttention", false]], "bertforquestionanswering (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.BertForQuestionAnswering", false]], "bertforsequenceclassification (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.BertForSequenceClassification", false]], "bertmodel (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.BertModel", false]], "best_of (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.best_of", false]], "bidirectional (tensorrt_llm.functional.attentionmasktype attribute)": [[82, "tensorrt_llm.functional.AttentionMaskType.bidirectional", false]], "bidirectionalglm (tensorrt_llm.functional.attentionmasktype attribute)": [[82, "tensorrt_llm.functional.AttentionMaskType.bidirectionalglm", false]], "blocksparse (tensorrt_llm.functional.attentionmasktype attribute)": [[82, "tensorrt_llm.functional.AttentionMaskType.blocksparse", false]], "blocksparseattnparams (class in tensorrt_llm.layers.attention)": [[83, "tensorrt_llm.layers.attention.BlockSparseAttnParams", false]], "bloomforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.BloomForCausalLM", false]], "bloommodel (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.BloomModel", false]], "broadcast_helper() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.broadcast_helper", false]], "buffer_allocated (tensorrt_llm.runtime.generationsession attribute)": [[87, 
"tensorrt_llm.runtime.GenerationSession.buffer_allocated", false]], "build_config (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.build_config", false]], "build_config (tensorrt_llm.llmapi.trtllmargs attribute)": [[70, "tensorrt_llm.llmapi.TrtLlmArgs.build_config", false]], "buildcacheconfig (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.BuildCacheConfig", false]], "buildconfig (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.BuildConfig", false]], "cache_root (tensorrt_llm.llmapi.buildcacheconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildCacheConfig.cache_root", false]], "cache_root (tensorrt_llm.llmapi.buildcacheconfig property)": [[70, "id7", false]], "cachetransceiverconfig (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.CacheTransceiverConfig", false]], "calculate_speculative_resource() (tensorrt_llm.llmapi.lookaheaddecodingconfig method)": [[70, "tensorrt_llm.llmapi.LookaheadDecodingConfig.calculate_speculative_resource", false]], "calib_batch_size (tensorrt_llm.llmapi.calibconfig attribute)": [[70, "tensorrt_llm.llmapi.CalibConfig.calib_batch_size", false]], "calib_batches (tensorrt_llm.llmapi.calibconfig attribute)": [[70, "tensorrt_llm.llmapi.CalibConfig.calib_batches", false]], "calib_config (tensorrt_llm.llmapi.trtllmargs attribute)": [[70, "tensorrt_llm.llmapi.TrtLlmArgs.calib_config", false]], "calib_dataset (tensorrt_llm.llmapi.calibconfig attribute)": [[70, "tensorrt_llm.llmapi.CalibConfig.calib_dataset", false]], "calib_max_seq_length (tensorrt_llm.llmapi.calibconfig attribute)": [[70, "tensorrt_llm.llmapi.CalibConfig.calib_max_seq_length", false]], "calibconfig (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.CalibConfig", false]], "capacity_scheduler_policy (tensorrt_llm.llmapi.schedulerconfig attribute)": [[70, "tensorrt_llm.llmapi.SchedulerConfig.capacity_scheduler_policy", false]], "capacityschedulerpolicy (class in tensorrt_llm.llmapi)": [[70, 
"tensorrt_llm.llmapi.CapacitySchedulerPolicy", false]], "cast (class in tensorrt_llm.layers.cast)": [[83, "tensorrt_llm.layers.cast.Cast", false]], "cast() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.cast", false]], "cast() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.cast", false]], "categorical_sample() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.categorical_sample", false]], "causal (tensorrt_llm.functional.attentionmasktype attribute)": [[82, "tensorrt_llm.functional.AttentionMaskType.causal", false]], "chatglm (tensorrt_llm.functional.positionembeddingtype attribute)": [[82, "tensorrt_llm.functional.PositionEmbeddingType.chatglm", false]], "chatglmconfig (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.ChatGLMConfig", false]], "chatglmforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.ChatGLMForCausalLM", false]], "chatglmgenerationsession (class in tensorrt_llm.runtime)": [[87, "tensorrt_llm.runtime.ChatGLMGenerationSession", false]], "chatglmmodel (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.ChatGLMModel", false]], "check_config() (tensorrt_llm.models.decodermodel method)": [[84, "tensorrt_llm.models.DecoderModel.check_config", false]], "check_config() (tensorrt_llm.models.dit method)": [[84, "tensorrt_llm.models.DiT.check_config", false]], "check_config() (tensorrt_llm.models.encodermodel method)": [[84, "tensorrt_llm.models.EncoderModel.check_config", false]], "check_config() (tensorrt_llm.models.falconforcausallm method)": [[84, "tensorrt_llm.models.FalconForCausalLM.check_config", false]], "check_config() (tensorrt_llm.models.mptforcausallm method)": [[84, "tensorrt_llm.models.MPTForCausalLM.check_config", false]], "check_config() (tensorrt_llm.models.optforcausallm method)": [[84, "tensorrt_llm.models.OPTForCausalLM.check_config", false]], "check_config() (tensorrt_llm.models.phiforcausallm method)": [[84, 
"tensorrt_llm.models.PhiForCausalLM.check_config", false]], "check_config() (tensorrt_llm.models.pretrainedmodel method)": [[84, "tensorrt_llm.models.PretrainedModel.check_config", false]], "choices() (tensorrt_llm.functional.positionembeddingtype static method)": [[82, "tensorrt_llm.functional.PositionEmbeddingType.choices", false]], "chunk() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.chunk", false]], "clamp_val (tensorrt_llm.llmapi.quantconfig attribute)": [[70, "tensorrt_llm.llmapi.QuantConfig.clamp_val", false]], "clip() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.clip", false]], "clipvisiontransformer (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.CLIPVisionTransformer", false]], "cogvlmattention (class in tensorrt_llm.layers.attention)": [[83, "tensorrt_llm.layers.attention.CogVLMAttention", false]], "cogvlmconfig (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.CogVLMConfig", false]], "cogvlmforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.CogVLMForCausalLM", false]], "cohereforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.CohereForCausalLM", false]], "collect_and_bias() (tensorrt_llm.layers.linear.linear method)": [[83, "tensorrt_llm.layers.linear.Linear.collect_and_bias", false]], "collect_and_bias() (tensorrt_llm.layers.linear.linearbase method)": [[83, "tensorrt_llm.layers.linear.LinearBase.collect_and_bias", false]], "collect_and_bias() (tensorrt_llm.layers.linear.rowlinear method)": [[83, "tensorrt_llm.layers.linear.RowLinear.collect_and_bias", false]], "columnlinear (in module tensorrt_llm.layers.linear)": [[83, "tensorrt_llm.layers.linear.ColumnLinear", false]], "combinedtimesteplabelembeddings (class in tensorrt_llm.layers.embedding)": [[83, "tensorrt_llm.layers.embedding.CombinedTimestepLabelEmbeddings", false]], "combinedtimesteptextprojembeddings (class in tensorrt_llm.layers.embedding)": [[83, 
"tensorrt_llm.layers.embedding.CombinedTimestepTextProjEmbeddings", false]], "completionoutput (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.CompletionOutput", false]], "compute_relative_bias() (in module tensorrt_llm.layers.attention)": [[83, "tensorrt_llm.layers.attention.compute_relative_bias", false]], "concat() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.concat", false]], "conditional (class in tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.Conditional", false]], "config_class (tensorrt_llm.models.baichuanforcausallm attribute)": [[84, "tensorrt_llm.models.BaichuanForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.chatglmforcausallm attribute)": [[84, "tensorrt_llm.models.ChatGLMForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.cogvlmforcausallm attribute)": [[84, "tensorrt_llm.models.CogVLMForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.cohereforcausallm attribute)": [[84, "tensorrt_llm.models.CohereForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.dbrxforcausallm attribute)": [[84, "tensorrt_llm.models.DbrxForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.deepseekforcausallm attribute)": [[84, "tensorrt_llm.models.DeepseekForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.deepseekv2forcausallm attribute)": [[84, "tensorrt_llm.models.DeepseekV2ForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.eagleforcausallm attribute)": [[84, "tensorrt_llm.models.EagleForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.falconforcausallm attribute)": [[84, "tensorrt_llm.models.FalconForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.gemmaforcausallm attribute)": [[84, "tensorrt_llm.models.GemmaForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.gptforcausallm attribute)": [[84, 
"tensorrt_llm.models.GPTForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.gptjforcausallm attribute)": [[84, "tensorrt_llm.models.GPTJForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.llamaforcausallm attribute)": [[84, "tensorrt_llm.models.LLaMAForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.mambaforcausallm attribute)": [[84, "tensorrt_llm.models.MambaForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.medusaforcausallm attribute)": [[84, "tensorrt_llm.models.MedusaForCausalLm.config_class", false]], "config_class (tensorrt_llm.models.mllamaforcausallm attribute)": [[84, "tensorrt_llm.models.MLLaMAForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.phi3forcausallm attribute)": [[84, "tensorrt_llm.models.Phi3ForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.phiforcausallm attribute)": [[84, "tensorrt_llm.models.PhiForCausalLM.config_class", false]], "config_class (tensorrt_llm.models.sd3transformer2dmodel attribute)": [[84, "tensorrt_llm.models.SD3Transformer2DModel.config_class", false]], "constant() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.constant", false]], "constant_to_tensor_() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.constant_to_tensor_", false]], "constants_to_tensors_() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.constants_to_tensors_", false]], "context (tensorrt_llm.runtime.session property)": [[87, "tensorrt_llm.runtime.Session.context", false]], "context_chunking_policy (tensorrt_llm.llmapi.schedulerconfig attribute)": [[70, "tensorrt_llm.llmapi.SchedulerConfig.context_chunking_policy", false]], "context_logits (tensorrt_llm.llmapi.requestoutput attribute)": [[70, "tensorrt_llm.llmapi.RequestOutput.context_logits", false]], "context_mem_size (tensorrt_llm.runtime.generationsession property)": [[87, 
"tensorrt_llm.runtime.GenerationSession.context_mem_size", false]], "context_mem_size (tensorrt_llm.runtime.session property)": [[87, "tensorrt_llm.runtime.Session.context_mem_size", false]], "contextchunkingpolicy (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.ContextChunkingPolicy", false]], "conv1d (class in tensorrt_llm.layers.conv)": [[83, "tensorrt_llm.layers.conv.Conv1d", false]], "conv1d() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.conv1d", false]], "conv2d (class in tensorrt_llm.layers.conv)": [[83, "tensorrt_llm.layers.conv.Conv2d", false]], "conv2d() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.conv2d", false]], "conv3d (class in tensorrt_llm.layers.conv)": [[83, "tensorrt_llm.layers.conv.Conv3d", false]], "conv3d() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.conv3d", false]], "conv_kernel (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.conv_kernel", false]], "conv_kernel (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.conv_kernel", false]], "conv_transpose2d() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.conv_transpose2d", false]], "convert_load_format() (tensorrt_llm.llmapi.torchllmargs class method)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.convert_load_format", false]], "convtranspose2d (class in tensorrt_llm.layers.conv)": [[83, "tensorrt_llm.layers.conv.ConvTranspose2d", false]], "copy_on_partial_reuse (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[70, "tensorrt_llm.llmapi.KvCacheConfig.copy_on_partial_reuse", false]], "cos() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.cos", false]], "cp_split_plugin() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.cp_split_plugin", false]], "cpp_e2e (tensorrt_llm.runtime.multimodalmodelrunner property)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.cpp_e2e", 
false]], "cpp_llm_only (tensorrt_llm.runtime.multimodalmodelrunner property)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.cpp_llm_only", false]], "create_allreduce_plugin() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.create_allreduce_plugin", false]], "create_attention_const_params() (tensorrt_llm.layers.attention.attention static method)": [[83, "tensorrt_llm.layers.attention.Attention.create_attention_const_params", false]], "create_fake_weight() (tensorrt_llm.functional.ropeembeddingutils static method)": [[82, "tensorrt_llm.functional.RopeEmbeddingUtils.create_fake_weight", false]], "create_runtime_defaults() (tensorrt_llm.models.pretrainedconfig static method)": [[84, "tensorrt_llm.models.PretrainedConfig.create_runtime_defaults", false]], "create_sinusoidal_positions() (tensorrt_llm.functional.ropeembeddingutils static method)": [[82, "tensorrt_llm.functional.RopeEmbeddingUtils.create_sinusoidal_positions", false]], "create_sinusoidal_positions_for_attention_plugin() (tensorrt_llm.functional.ropeembeddingutils static method)": [[82, "tensorrt_llm.functional.RopeEmbeddingUtils.create_sinusoidal_positions_for_attention_plugin", false]], "create_sinusoidal_positions_for_cogvlm_attention_plugin() (tensorrt_llm.functional.ropeembeddingutils static method)": [[82, "tensorrt_llm.functional.RopeEmbeddingUtils.create_sinusoidal_positions_for_cogvlm_attention_plugin", false]], "create_sinusoidal_positions_long_rope() (tensorrt_llm.functional.ropeembeddingutils method)": [[82, "tensorrt_llm.functional.RopeEmbeddingUtils.create_sinusoidal_positions_long_rope", false]], "create_sinusoidal_positions_yarn() (tensorrt_llm.functional.ropeembeddingutils static method)": [[82, "tensorrt_llm.functional.RopeEmbeddingUtils.create_sinusoidal_positions_yarn", false]], "cropped_pos_embed() (tensorrt_llm.layers.embedding.sd3patchembed method)": [[83, "tensorrt_llm.layers.embedding.SD3PatchEmbed.cropped_pos_embed", false]], "cross_attention 
(tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.cross_attention", false]], "cross_attention (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.cross_attention", false]], "cross_kv_cache_fraction (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[70, "tensorrt_llm.llmapi.KvCacheConfig.cross_kv_cache_fraction", false]], "ctx_request_id (tensorrt_llm.llmapi.disaggregatedparams attribute)": [[70, "tensorrt_llm.llmapi.DisaggregatedParams.ctx_request_id", false]], "cuda_graph_batch_sizes (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.cuda_graph_batch_sizes", false]], "cuda_graph_cache_size (tensorrt_llm.llmapi.extendedruntimeperfknobconfig attribute)": [[70, "tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.cuda_graph_cache_size", false]], "cuda_graph_max_batch_size (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.cuda_graph_max_batch_size", false]], "cuda_graph_mode (tensorrt_llm.llmapi.extendedruntimeperfknobconfig attribute)": [[70, "tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.cuda_graph_mode", false]], "cuda_graph_mode (tensorrt_llm.runtime.generationsession attribute)": [[87, "tensorrt_llm.runtime.GenerationSession.cuda_graph_mode", false]], "cuda_graph_padding_enabled (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.cuda_graph_padding_enabled", false]], "cuda_stream_guard() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.cuda_stream_guard", false]], "cuda_stream_sync() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.cuda_stream_sync", false]], "cumsum() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.cumsum", false]], "cumulative_logprob (tensorrt_llm.llmapi.completionoutput attribute)": [[70, "tensorrt_llm.llmapi.CompletionOutput.cumulative_logprob", false]], "custom_mask 
(tensorrt_llm.functional.attentionmasktype attribute)": [[82, "tensorrt_llm.functional.AttentionMaskType.custom_mask", false]], "data (tensorrt_llm.functional.sliceinputtype attribute)": [[82, "tensorrt_llm.functional.SliceInputType.data", false]], "dbrxconfig (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.DbrxConfig", false]], "dbrxforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.DbrxForCausalLM", false]], "debug_mode (tensorrt_llm.runtime.generationsession attribute)": [[87, "tensorrt_llm.runtime.GenerationSession.debug_mode", false]], "debug_tensors_to_save (tensorrt_llm.runtime.generationsession attribute)": [[87, "tensorrt_llm.runtime.GenerationSession.debug_tensors_to_save", false]], "decode() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.decode", false]], "decode_batch() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.decode_batch", false]], "decode_duration_ms (tensorrt_llm.llmapi.kvcacheretentionconfig property)": [[70, "tensorrt_llm.llmapi.KvCacheRetentionConfig.decode_duration_ms", false]], "decode_regular() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.decode_regular", false]], "decode_retention_priority (tensorrt_llm.llmapi.kvcacheretentionconfig property)": [[70, "tensorrt_llm.llmapi.KvCacheRetentionConfig.decode_retention_priority", false]], "decode_stream() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.decode_stream", false]], "decode_words_list() (in module tensorrt_llm.runtime)": [[87, "tensorrt_llm.runtime.decode_words_list", false]], "decodermodel (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.DecoderModel", false]], "decoding_config (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.decoding_config", false]], "decoding_config (tensorrt_llm.llmapi.trtllmargs 
attribute)": [[70, "tensorrt_llm.llmapi.TrtLlmArgs.decoding_config", false]], "decoding_type (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.EagleDecodingConfig.decoding_type", false]], "decoding_type (tensorrt_llm.llmapi.lookaheaddecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.LookaheadDecodingConfig.decoding_type", false]], "decoding_type (tensorrt_llm.llmapi.medusadecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.MedusaDecodingConfig.decoding_type", false]], "decoding_type (tensorrt_llm.llmapi.mtpdecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.MTPDecodingConfig.decoding_type", false]], "decoding_type (tensorrt_llm.llmapi.ngramdecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.NGramDecodingConfig.decoding_type", false]], "deepseekforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.DeepseekForCausalLM", false]], "deepseekv2attention (class in tensorrt_llm.layers.attention)": [[83, "tensorrt_llm.layers.attention.DeepseekV2Attention", false]], "deepseekv2forcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.DeepseekV2ForCausalLM", false]], "default_plugin_config() (tensorrt_llm.models.cogvlmforcausallm method)": [[84, "tensorrt_llm.models.CogVLMForCausalLM.default_plugin_config", false]], "default_plugin_config() (tensorrt_llm.models.llamaforcausallm method)": [[84, "tensorrt_llm.models.LLaMAForCausalLM.default_plugin_config", false]], "deferred (tensorrt_llm.functional.positionembeddingtype attribute)": [[82, "tensorrt_llm.functional.PositionEmbeddingType.deferred", false]], "detokenize (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.detokenize", false]], "device (tensorrt_llm.llmapi.calibconfig attribute)": [[70, "tensorrt_llm.llmapi.CalibConfig.device", false]], "device (tensorrt_llm.runtime.generationsession attribute)": [[87, "tensorrt_llm.runtime.GenerationSession.device", false]], "diffusersattention (class in 
tensorrt_llm.layers.attention)": [[83, "tensorrt_llm.layers.attention.DiffusersAttention", false]], "dimrange (class in tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.DimRange", false]], "directory (tensorrt_llm.llmapi.kvcacheretentionconfig property)": [[70, "tensorrt_llm.llmapi.KvCacheRetentionConfig.directory", false]], "disable (tensorrt_llm.functional.sidestreamidtype attribute)": [[82, "tensorrt_llm.functional.SideStreamIDType.disable", false]], "disable_forward_chunking() (tensorrt_llm.models.sd3transformer2dmodel method)": [[84, "tensorrt_llm.models.SD3Transformer2DModel.disable_forward_chunking", false]], "disable_overlap_scheduler (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.disable_overlap_scheduler", false]], "disaggregated_params (tensorrt_llm.llmapi.completionoutput attribute)": [[70, "tensorrt_llm.llmapi.CompletionOutput.disaggregated_params", false]], "disaggregatedparams (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.DisaggregatedParams", false]], "dit (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.DiT", false]], "div() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.div", false]], "dora_plugin() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.dora_plugin", false]], "draft_tokens (tensorrt_llm.llmapi.disaggregatedparams attribute)": [[70, "tensorrt_llm.llmapi.DisaggregatedParams.draft_tokens", false]], "draft_tokens_external (tensorrt_llm.models.speculativedecodingmode attribute)": [[84, "tensorrt_llm.models.SpeculativeDecodingMode.DRAFT_TOKENS_EXTERNAL", false]], "dry_run (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.dry_run", false]], "dtype (tensorrt_llm.functional.tensor property)": [[82, "tensorrt_llm.functional.Tensor.dtype", false]], "dtype (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.dtype", false]], "dtype 
(tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.dtype", false]], "dtype (tensorrt_llm.runtime.modelrunner property)": [[87, "tensorrt_llm.runtime.ModelRunner.dtype", false]], "dtype (tensorrt_llm.runtime.modelrunnercpp property)": [[87, "tensorrt_llm.runtime.ModelRunnerCpp.dtype", false]], "dtype (tensorrt_llm.runtime.tensorinfo attribute)": [[87, "tensorrt_llm.runtime.TensorInfo.dtype", false]], "dump_debug_buffers() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.dump_debug_buffers", false]], "duration_ms (tensorrt_llm.llmapi.kvcacheretentionconfig.tokenrangeretentionconfig property)": [[70, "tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.duration_ms", false]], "dynamic (tensorrt_llm.functional.rotaryscalingtype attribute)": [[82, "tensorrt_llm.functional.RotaryScalingType.dynamic", false]], "dynamic_batch_config (tensorrt_llm.llmapi.schedulerconfig attribute)": [[70, "tensorrt_llm.llmapi.SchedulerConfig.dynamic_batch_config", false]], "dynamic_batch_moving_average_window (tensorrt_llm.llmapi.dynamicbatchconfig attribute)": [[70, "tensorrt_llm.llmapi.DynamicBatchConfig.dynamic_batch_moving_average_window", false]], "dynamic_tree_max_topk (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.EagleDecodingConfig.dynamic_tree_max_topK", false]], "dynamicbatchconfig (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.DynamicBatchConfig", false]], "eagle (tensorrt_llm.models.speculativedecodingmode attribute)": [[84, "tensorrt_llm.models.SpeculativeDecodingMode.EAGLE", false]], "eagle3_one_model (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.EagleDecodingConfig.eagle3_one_model", false]], "eagle_choices (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.EagleDecodingConfig.eagle_choices", false]], "eagledecodingconfig (class in tensorrt_llm.llmapi)": [[70, 
"tensorrt_llm.llmapi.EagleDecodingConfig", false]], "eagleforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.EagleForCausalLM", false]], "early_stop_criteria() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.early_stop_criteria", false]], "early_stopping (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.early_stopping", false]], "early_stopping (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.early_stopping", false]], "einsum() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.einsum", false]], "elementwise_binary() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.elementwise_binary", false]], "embedding (class in tensorrt_llm.layers.embedding)": [[83, "tensorrt_llm.layers.embedding.Embedding", false]], "embedding() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.embedding", false]], "embedding_bias (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.embedding_bias", false]], "embedding_parallel_mode (tensorrt_llm.llmapi.trtllmargs attribute)": [[70, "tensorrt_llm.llmapi.TrtLlmArgs.embedding_parallel_mode", false]], "enable_batch_size_tuning (tensorrt_llm.llmapi.dynamicbatchconfig attribute)": [[70, "tensorrt_llm.llmapi.DynamicBatchConfig.enable_batch_size_tuning", false]], "enable_block_reuse (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[70, "tensorrt_llm.llmapi.KvCacheConfig.enable_block_reuse", false]], "enable_build_cache (tensorrt_llm.llmapi.trtllmargs attribute)": [[70, "tensorrt_llm.llmapi.TrtLlmArgs.enable_build_cache", false]], "enable_context_fmha_fp32_acc (tensorrt_llm.llmapi.extendedruntimeperfknobconfig attribute)": [[70, "tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.enable_context_fmha_fp32_acc", false]], "enable_debug_output (tensorrt_llm.llmapi.buildconfig attribute)": [[70, 
"tensorrt_llm.llmapi.BuildConfig.enable_debug_output", false]], "enable_forward_chunking() (tensorrt_llm.models.sd3transformer2dmodel method)": [[84, "tensorrt_llm.models.SD3Transformer2DModel.enable_forward_chunking", false]], "enable_iter_perf_stats (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.enable_iter_perf_stats", false]], "enable_iter_req_stats (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.enable_iter_req_stats", false]], "enable_layerwise_nvtx_marker (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.enable_layerwise_nvtx_marker", false]], "enable_max_num_tokens_tuning (tensorrt_llm.llmapi.dynamicbatchconfig attribute)": [[70, "tensorrt_llm.llmapi.DynamicBatchConfig.enable_max_num_tokens_tuning", false]], "enable_min_latency (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.enable_min_latency", false]], "enable_partial_reuse (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[70, "tensorrt_llm.llmapi.KvCacheConfig.enable_partial_reuse", false]], "enable_tqdm (tensorrt_llm.llmapi.trtllmargs attribute)": [[70, "tensorrt_llm.llmapi.TrtLlmArgs.enable_tqdm", false]], "enable_trtllm_sampler (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.enable_trtllm_sampler", false]], "encdecmodelrunner (class in tensorrt_llm.runtime)": [[87, "tensorrt_llm.runtime.EncDecModelRunner", false]], "encoder_run() (tensorrt_llm.runtime.encdecmodelrunner method)": [[87, "tensorrt_llm.runtime.EncDecModelRunner.encoder_run", false]], "encodermodel (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.EncoderModel", false]], "end_id (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.end_id", false]], "end_id (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.end_id", false]], "engine (tensorrt_llm.runtime.session 
property)": [[87, "tensorrt_llm.runtime.Session.engine", false]], "engine_inspector (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.engine_inspector", false]], "eq() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.eq", false]], "equal_progress (tensorrt_llm.llmapi.contextchunkingpolicy attribute)": [[70, "tensorrt_llm.llmapi.ContextChunkingPolicy.EQUAL_PROGRESS", false]], "event_buffer_max_size (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[70, "tensorrt_llm.llmapi.KvCacheConfig.event_buffer_max_size", false]], "exclude_input_from_output (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.exclude_input_from_output", false]], "exclude_modules (tensorrt_llm.llmapi.quantconfig attribute)": [[70, "tensorrt_llm.llmapi.QuantConfig.exclude_modules", false]], "exp() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.exp", false]], "expand() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.expand", false]], "expand_dims() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.expand_dims", false]], "expand_dims_like() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.expand_dims_like", false]], "expand_mask() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.expand_mask", false]], "explicit_draft_tokens (tensorrt_llm.models.speculativedecodingmode attribute)": [[84, "tensorrt_llm.models.SpeculativeDecodingMode.EXPLICIT_DRAFT_TOKENS", false]], "extended_runtime_perf_knob_config (tensorrt_llm.llmapi.trtllmargs attribute)": [[70, "tensorrt_llm.llmapi.TrtLlmArgs.extended_runtime_perf_knob_config", false]], "extendedruntimeperfknobconfig (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig", false]], "extra_resource_managers (tensorrt_llm.llmapi.torchllmargs property)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.extra_resource_managers", false]], 
"falconconfig (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.FalconConfig", false]], "falconforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.FalconForCausalLM", false]], "falconmodel (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.FalconModel", false]], "fast_build (tensorrt_llm.llmapi.trtllmargs attribute)": [[70, "tensorrt_llm.llmapi.TrtLlmArgs.fast_build", false]], "fc_gate() (tensorrt_llm.layers.mlp.fusedgatedmlp method)": [[83, "tensorrt_llm.layers.mlp.FusedGatedMLP.fc_gate", false]], "fc_gate_dora() (in module tensorrt_llm.layers.mlp)": [[83, "tensorrt_llm.layers.mlp.fc_gate_dora", false]], "fc_gate_lora() (in module tensorrt_llm.layers.mlp)": [[83, "tensorrt_llm.layers.mlp.fc_gate_lora", false]], "fc_gate_plugin() (tensorrt_llm.layers.mlp.fusedgatedmlp method)": [[83, "tensorrt_llm.layers.mlp.FusedGatedMLP.fc_gate_plugin", false]], "field_name (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "id12", false], [70, "id15", false], [70, "id18", false], [70, "tensorrt_llm.llmapi.TorchLlmArgs.field_name", false]], "field_name (tensorrt_llm.llmapi.trtllmargs attribute)": [[70, "id21", false], [70, "id24", false], [70, "id27", false], [70, "id30", false], [70, "id33", false], [70, "tensorrt_llm.llmapi.TrtLlmArgs.field_name", false]], "fill_attention_const_params_for_long_rope() (tensorrt_llm.layers.attention.attentionparams method)": [[83, "tensorrt_llm.layers.attention.AttentionParams.fill_attention_const_params_for_long_rope", false]], "fill_attention_const_params_for_rope() (tensorrt_llm.layers.attention.attentionparams method)": [[83, "tensorrt_llm.layers.attention.AttentionParams.fill_attention_const_params_for_rope", false]], "fill_attention_params() (tensorrt_llm.layers.attention.attention static method)": [[83, "tensorrt_llm.layers.attention.Attention.fill_attention_params", false]], "fill_none_tensor_list() (tensorrt_llm.layers.attention.keyvaluecacheparams method)": [[83, 
"tensorrt_llm.layers.attention.KeyValueCacheParams.fill_none_tensor_list", false]], "fill_value (tensorrt_llm.functional.sliceinputtype attribute)": [[82, "tensorrt_llm.functional.SliceInputType.fill_value", false]], "filter_medusa_logits() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.filter_medusa_logits", false]], "finalize_decoder() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.finalize_decoder", false]], "find_best_medusa_path() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.find_best_medusa_path", false]], "finish_reason (tensorrt_llm.llmapi.completionoutput attribute)": [[70, "tensorrt_llm.llmapi.CompletionOutput.finish_reason", false]], "finished (tensorrt_llm.llmapi.requestoutput attribute)": [[70, "tensorrt_llm.llmapi.RequestOutput.finished", false]], "first_come_first_served (tensorrt_llm.llmapi.contextchunkingpolicy attribute)": [[70, "tensorrt_llm.llmapi.ContextChunkingPolicy.FIRST_COME_FIRST_SERVED", false]], "first_gen_tokens (tensorrt_llm.llmapi.disaggregatedparams attribute)": [[70, "tensorrt_llm.llmapi.DisaggregatedParams.first_gen_tokens", false]], "first_layer (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.first_layer", false]], "flatten() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.flatten", false]], "flatten() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.flatten", false]], "flip() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.flip", false]], "floordiv() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.floordiv", false]], "fmt_dim (c macro)": [[1, "c.FMT_DIM", false]], "for_each_rank() (tensorrt_llm.models.pretrainedconfig method)": [[84, "tensorrt_llm.models.PretrainedConfig.for_each_rank", false]], "force_num_profiles 
(tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.force_num_profiles", false]], "forward() (tensorrt_llm.layers.activation.mish method)": [[83, "tensorrt_llm.layers.activation.Mish.forward", false]], "forward() (tensorrt_llm.layers.attention.attention method)": [[83, "tensorrt_llm.layers.attention.Attention.forward", false]], "forward() (tensorrt_llm.layers.attention.bertattention method)": [[83, "tensorrt_llm.layers.attention.BertAttention.forward", false]], "forward() (tensorrt_llm.layers.attention.cogvlmattention method)": [[83, "tensorrt_llm.layers.attention.CogVLMAttention.forward", false]], "forward() (tensorrt_llm.layers.attention.deepseekv2attention method)": [[83, "tensorrt_llm.layers.attention.DeepseekV2Attention.forward", false]], "forward() (tensorrt_llm.layers.attention.diffusersattention method)": [[83, "tensorrt_llm.layers.attention.DiffusersAttention.forward", false]], "forward() (tensorrt_llm.layers.cast.cast method)": [[83, "tensorrt_llm.layers.cast.Cast.forward", false]], "forward() (tensorrt_llm.layers.conv.conv1d method)": [[83, "tensorrt_llm.layers.conv.Conv1d.forward", false]], "forward() (tensorrt_llm.layers.conv.conv2d method)": [[83, "tensorrt_llm.layers.conv.Conv2d.forward", false]], "forward() (tensorrt_llm.layers.conv.conv3d method)": [[83, "tensorrt_llm.layers.conv.Conv3d.forward", false]], "forward() (tensorrt_llm.layers.conv.convtranspose2d method)": [[83, "tensorrt_llm.layers.conv.ConvTranspose2d.forward", false]], "forward() (tensorrt_llm.layers.embedding.combinedtimesteplabelembeddings method)": [[83, "tensorrt_llm.layers.embedding.CombinedTimestepLabelEmbeddings.forward", false]], "forward() (tensorrt_llm.layers.embedding.combinedtimesteptextprojembeddings method)": [[83, "tensorrt_llm.layers.embedding.CombinedTimestepTextProjEmbeddings.forward", false]], "forward() (tensorrt_llm.layers.embedding.embedding method)": [[83, "tensorrt_llm.layers.embedding.Embedding.forward", false]], "forward() 
(tensorrt_llm.layers.embedding.labelembedding method)": [[83, "tensorrt_llm.layers.embedding.LabelEmbedding.forward", false]], "forward() (tensorrt_llm.layers.embedding.pixartalphatextprojection method)": [[83, "tensorrt_llm.layers.embedding.PixArtAlphaTextProjection.forward", false]], "forward() (tensorrt_llm.layers.embedding.prompttuningembedding method)": [[83, "tensorrt_llm.layers.embedding.PromptTuningEmbedding.forward", false]], "forward() (tensorrt_llm.layers.embedding.sd3patchembed method)": [[83, "tensorrt_llm.layers.embedding.SD3PatchEmbed.forward", false]], "forward() (tensorrt_llm.layers.embedding.timestepembedding method)": [[83, "tensorrt_llm.layers.embedding.TimestepEmbedding.forward", false]], "forward() (tensorrt_llm.layers.embedding.timesteps method)": [[83, "tensorrt_llm.layers.embedding.Timesteps.forward", false]], "forward() (tensorrt_llm.layers.linear.linearbase method)": [[83, "tensorrt_llm.layers.linear.LinearBase.forward", false]], "forward() (tensorrt_llm.layers.mlp.fusedgatedmlp method)": [[83, "tensorrt_llm.layers.mlp.FusedGatedMLP.forward", false]], "forward() (tensorrt_llm.layers.mlp.gatedmlp method)": [[83, "tensorrt_llm.layers.mlp.GatedMLP.forward", false]], "forward() (tensorrt_llm.layers.mlp.linearactivation method)": [[83, "tensorrt_llm.layers.mlp.LinearActivation.forward", false]], "forward() (tensorrt_llm.layers.mlp.linearapproximategelu method)": [[83, "tensorrt_llm.layers.mlp.LinearApproximateGELU.forward", false]], "forward() (tensorrt_llm.layers.mlp.lineargeglu method)": [[83, "tensorrt_llm.layers.mlp.LinearGEGLU.forward", false]], "forward() (tensorrt_llm.layers.mlp.lineargelu method)": [[83, "tensorrt_llm.layers.mlp.LinearGELU.forward", false]], "forward() (tensorrt_llm.layers.mlp.linearswiglu method)": [[83, "tensorrt_llm.layers.mlp.LinearSwiGLU.forward", false]], "forward() (tensorrt_llm.layers.mlp.mlp method)": [[83, "tensorrt_llm.layers.mlp.MLP.forward", false]], "forward() 
(tensorrt_llm.layers.normalization.adalayernorm method)": [[83, "tensorrt_llm.layers.normalization.AdaLayerNorm.forward", false]], "forward() (tensorrt_llm.layers.normalization.adalayernormcontinuous method)": [[83, "tensorrt_llm.layers.normalization.AdaLayerNormContinuous.forward", false]], "forward() (tensorrt_llm.layers.normalization.adalayernormzero method)": [[83, "tensorrt_llm.layers.normalization.AdaLayerNormZero.forward", false]], "forward() (tensorrt_llm.layers.normalization.adalayernormzerosingle method)": [[83, "tensorrt_llm.layers.normalization.AdaLayerNormZeroSingle.forward", false]], "forward() (tensorrt_llm.layers.normalization.groupnorm method)": [[83, "tensorrt_llm.layers.normalization.GroupNorm.forward", false]], "forward() (tensorrt_llm.layers.normalization.layernorm method)": [[83, "tensorrt_llm.layers.normalization.LayerNorm.forward", false]], "forward() (tensorrt_llm.layers.normalization.rmsnorm method)": [[83, "tensorrt_llm.layers.normalization.RmsNorm.forward", false]], "forward() (tensorrt_llm.layers.normalization.sd35adalayernormzerox method)": [[83, "tensorrt_llm.layers.normalization.SD35AdaLayerNormZeroX.forward", false]], "forward() (tensorrt_llm.layers.pooling.avgpool2d method)": [[83, "tensorrt_llm.layers.pooling.AvgPool2d.forward", false]], "forward() (tensorrt_llm.models.bertforquestionanswering method)": [[84, "tensorrt_llm.models.BertForQuestionAnswering.forward", false]], "forward() (tensorrt_llm.models.bertforsequenceclassification method)": [[84, "tensorrt_llm.models.BertForSequenceClassification.forward", false]], "forward() (tensorrt_llm.models.bertmodel method)": [[84, "tensorrt_llm.models.BertModel.forward", false]], "forward() (tensorrt_llm.models.bloommodel method)": [[84, "tensorrt_llm.models.BloomModel.forward", false]], "forward() (tensorrt_llm.models.chatglmmodel method)": [[84, "tensorrt_llm.models.ChatGLMModel.forward", false]], "forward() (tensorrt_llm.models.clipvisiontransformer method)": [[84, 
"tensorrt_llm.models.CLIPVisionTransformer.forward", false]], "forward() (tensorrt_llm.models.decodermodel method)": [[84, "tensorrt_llm.models.DecoderModel.forward", false]], "forward() (tensorrt_llm.models.dit method)": [[84, "tensorrt_llm.models.DiT.forward", false]], "forward() (tensorrt_llm.models.eagleforcausallm method)": [[84, "tensorrt_llm.models.EagleForCausalLM.forward", false]], "forward() (tensorrt_llm.models.encodermodel method)": [[84, "tensorrt_llm.models.EncoderModel.forward", false]], "forward() (tensorrt_llm.models.falconmodel method)": [[84, "tensorrt_llm.models.FalconModel.forward", false]], "forward() (tensorrt_llm.models.gptjmodel method)": [[84, "tensorrt_llm.models.GPTJModel.forward", false]], "forward() (tensorrt_llm.models.gptmodel method)": [[84, "tensorrt_llm.models.GPTModel.forward", false]], "forward() (tensorrt_llm.models.gptneoxmodel method)": [[84, "tensorrt_llm.models.GPTNeoXModel.forward", false]], "forward() (tensorrt_llm.models.llamamodel method)": [[84, "tensorrt_llm.models.LLaMAModel.forward", false]], "forward() (tensorrt_llm.models.llavanextvisionwrapper method)": [[84, "tensorrt_llm.models.LlavaNextVisionWrapper.forward", false]], "forward() (tensorrt_llm.models.mambaforcausallm method)": [[84, "tensorrt_llm.models.MambaForCausalLM.forward", false]], "forward() (tensorrt_llm.models.mllamaforcausallm method)": [[84, "tensorrt_llm.models.MLLaMAForCausalLM.forward", false]], "forward() (tensorrt_llm.models.mptmodel method)": [[84, "tensorrt_llm.models.MPTModel.forward", false]], "forward() (tensorrt_llm.models.optmodel method)": [[84, "tensorrt_llm.models.OPTModel.forward", false]], "forward() (tensorrt_llm.models.phi3model method)": [[84, "tensorrt_llm.models.Phi3Model.forward", false]], "forward() (tensorrt_llm.models.phimodel method)": [[84, "tensorrt_llm.models.PhiModel.forward", false]], "forward() (tensorrt_llm.models.recurrentgemmaforcausallm method)": [[84, "tensorrt_llm.models.RecurrentGemmaForCausalLM.forward", 
false]], "forward() (tensorrt_llm.models.redrafterforcausallm method)": [[84, "tensorrt_llm.models.ReDrafterForCausalLM.forward", false]], "forward() (tensorrt_llm.models.sd3transformer2dmodel method)": [[84, "tensorrt_llm.models.SD3Transformer2DModel.forward", false]], "forward() (tensorrt_llm.models.whisperencoder method)": [[84, "tensorrt_llm.models.WhisperEncoder.forward", false]], "forward_with_cfg() (tensorrt_llm.models.dit method)": [[84, "tensorrt_llm.models.DiT.forward_with_cfg", false]], "forward_without_cfg() (tensorrt_llm.models.dit method)": [[84, "tensorrt_llm.models.DiT.forward_without_cfg", false]], "fp8 (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.FP8", false]], "fp8_block_scales (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.FP8_BLOCK_SCALES", false]], "fp8_per_channel_per_token (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN", false]], "free_gpu_memory_fraction (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[70, "tensorrt_llm.llmapi.KvCacheConfig.free_gpu_memory_fraction", false]], "frequency_penalty (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.frequency_penalty", false]], "frequency_penalty (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.frequency_penalty", false]], "from_arguments() (tensorrt_llm.models.speculativedecodingmode static method)": [[84, "tensorrt_llm.models.SpeculativeDecodingMode.from_arguments", false]], "from_checkpoint() (tensorrt_llm.models.pretrainedconfig class method)": [[84, "tensorrt_llm.models.PretrainedConfig.from_checkpoint", false]], "from_checkpoint() (tensorrt_llm.models.pretrainedmodel class method)": [[84, "tensorrt_llm.models.PretrainedModel.from_checkpoint", false]], "from_config() (tensorrt_llm.models.pretrainedmodel class method)": [[84, "tensorrt_llm.models.PretrainedModel.from_config", 
false]], "from_dict() (tensorrt_llm.llmapi.buildconfig class method)": [[70, "tensorrt_llm.llmapi.BuildConfig.from_dict", false]], "from_dict() (tensorrt_llm.llmapi.calibconfig class method)": [[70, "tensorrt_llm.llmapi.CalibConfig.from_dict", false]], "from_dict() (tensorrt_llm.llmapi.eagledecodingconfig class method)": [[70, "tensorrt_llm.llmapi.EagleDecodingConfig.from_dict", false]], "from_dict() (tensorrt_llm.llmapi.lookaheaddecodingconfig class method)": [[70, "tensorrt_llm.llmapi.LookaheadDecodingConfig.from_dict", false]], "from_dict() (tensorrt_llm.llmapi.medusadecodingconfig class method)": [[70, "tensorrt_llm.llmapi.MedusaDecodingConfig.from_dict", false]], "from_dict() (tensorrt_llm.llmapi.mtpdecodingconfig class method)": [[70, "tensorrt_llm.llmapi.MTPDecodingConfig.from_dict", false]], "from_dict() (tensorrt_llm.llmapi.ngramdecodingconfig class method)": [[70, "tensorrt_llm.llmapi.NGramDecodingConfig.from_dict", false]], "from_dict() (tensorrt_llm.llmapi.quantconfig class method)": [[70, "tensorrt_llm.llmapi.QuantConfig.from_dict", false]], "from_dict() (tensorrt_llm.models.pretrainedconfig class method)": [[84, "tensorrt_llm.models.PretrainedConfig.from_dict", false]], "from_dir() (tensorrt_llm.runtime.modelrunner class method)": [[87, "tensorrt_llm.runtime.ModelRunner.from_dir", false]], "from_dir() (tensorrt_llm.runtime.modelrunnercpp class method)": [[87, "tensorrt_llm.runtime.ModelRunnerCpp.from_dir", false]], "from_engine() (tensorrt_llm.runtime.encdecmodelrunner class method)": [[87, "tensorrt_llm.runtime.EncDecModelRunner.from_engine", false]], "from_engine() (tensorrt_llm.runtime.modelrunner class method)": [[87, "tensorrt_llm.runtime.ModelRunner.from_engine", false]], "from_engine() (tensorrt_llm.runtime.session static method)": [[87, "tensorrt_llm.runtime.Session.from_engine", false]], "from_hugging_face() (tensorrt_llm.models.baichuanforcausallm class method)": [[84, "tensorrt_llm.models.BaichuanForCausalLM.from_hugging_face", false]], 
"from_hugging_face() (tensorrt_llm.models.chatglmconfig class method)": [[84, "tensorrt_llm.models.ChatGLMConfig.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.chatglmforcausallm class method)": [[84, "tensorrt_llm.models.ChatGLMForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.cogvlmforcausallm class method)": [[84, "tensorrt_llm.models.CogVLMForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.cohereforcausallm class method)": [[84, "tensorrt_llm.models.CohereForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.deepseekforcausallm class method)": [[84, "tensorrt_llm.models.DeepseekForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.deepseekv2forcausallm class method)": [[84, "tensorrt_llm.models.DeepseekV2ForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.eagleforcausallm class method)": [[84, "tensorrt_llm.models.EagleForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.falconconfig class method)": [[84, "tensorrt_llm.models.FalconConfig.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.falconforcausallm class method)": [[84, "tensorrt_llm.models.FalconForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.gemmaconfig class method)": [[84, "tensorrt_llm.models.GemmaConfig.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.gemmaforcausallm class method)": [[84, "tensorrt_llm.models.GemmaForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.gptconfig class method)": [[84, "tensorrt_llm.models.GPTConfig.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.gptforcausallm class method)": [[84, "tensorrt_llm.models.GPTForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.gptjconfig class method)": [[84, 
"tensorrt_llm.models.GPTJConfig.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.gptjforcausallm class method)": [[84, "tensorrt_llm.models.GPTJForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.llamaconfig class method)": [[84, "tensorrt_llm.models.LLaMAConfig.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.llamaforcausallm class method)": [[84, "tensorrt_llm.models.LLaMAForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.llavanextvisionconfig class method)": [[84, "tensorrt_llm.models.LlavaNextVisionConfig.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.llavanextvisionwrapper class method)": [[84, "tensorrt_llm.models.LlavaNextVisionWrapper.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.mambaforcausallm class method)": [[84, "tensorrt_llm.models.MambaForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.medusaconfig class method)": [[84, "tensorrt_llm.models.MedusaConfig.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.medusaforcausallm class method)": [[84, "tensorrt_llm.models.MedusaForCausalLm.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.mllamaforcausallm class method)": [[84, "tensorrt_llm.models.MLLaMAForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.phi3forcausallm class method)": [[84, "tensorrt_llm.models.Phi3ForCausalLM.from_hugging_face", false]], "from_hugging_face() (tensorrt_llm.models.phiforcausallm class method)": [[84, "tensorrt_llm.models.PhiForCausalLM.from_hugging_face", false]], "from_json_file() (tensorrt_llm.llmapi.buildconfig class method)": [[70, "tensorrt_llm.llmapi.BuildConfig.from_json_file", false]], "from_json_file() (tensorrt_llm.models.pretrainedconfig class method)": [[84, "tensorrt_llm.models.PretrainedConfig.from_json_file", false]], "from_meta_ckpt() 
(tensorrt_llm.models.llamaconfig class method)": [[84, "tensorrt_llm.models.LLaMAConfig.from_meta_ckpt", false]], "from_meta_ckpt() (tensorrt_llm.models.llamaforcausallm class method)": [[84, "tensorrt_llm.models.LLaMAForCausalLM.from_meta_ckpt", false]], "from_nemo() (tensorrt_llm.models.gptconfig class method)": [[84, "tensorrt_llm.models.GPTConfig.from_nemo", false]], "from_nemo() (tensorrt_llm.models.gptforcausallm class method)": [[84, "tensorrt_llm.models.GPTForCausalLM.from_nemo", false]], "from_pretrained() (tensorrt_llm.models.sd3transformer2dmodel class method)": [[84, "tensorrt_llm.models.SD3Transformer2DModel.from_pretrained", false]], "from_serialized_engine() (tensorrt_llm.runtime.session static method)": [[87, "tensorrt_llm.runtime.Session.from_serialized_engine", false]], "from_string() (tensorrt_llm.functional.positionembeddingtype static method)": [[82, "tensorrt_llm.functional.PositionEmbeddingType.from_string", false]], "from_string() (tensorrt_llm.functional.rotaryscalingtype static method)": [[82, "tensorrt_llm.functional.RotaryScalingType.from_string", false]], "fuse_qkv_projections() (tensorrt_llm.models.sd3transformer2dmodel method)": [[84, "tensorrt_llm.models.SD3Transformer2DModel.fuse_qkv_projections", false]], "fusedgatedmlp (class in tensorrt_llm.layers.mlp)": [[83, "tensorrt_llm.layers.mlp.FusedGatedMLP", false]], "fusedgatedmlp (tensorrt_llm.functional.mlptype attribute)": [[82, "tensorrt_llm.functional.MLPType.FusedGatedMLP", false]], "gatedmlp (class in tensorrt_llm.layers.mlp)": [[83, "tensorrt_llm.layers.mlp.GatedMLP", false]], "gatedmlp (tensorrt_llm.functional.mlptype attribute)": [[82, "tensorrt_llm.functional.MLPType.GatedMLP", false]], "gather() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.gather", false]], "gather_context_logits (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.gather_context_logits", false]], "gather_context_logits 
(tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.gather_context_logits", false]], "gather_context_logits (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.gather_context_logits", false]], "gather_context_logits (tensorrt_llm.runtime.modelrunner property)": [[87, "tensorrt_llm.runtime.ModelRunner.gather_context_logits", false]], "gather_context_logits (tensorrt_llm.runtime.modelrunnercpp property)": [[87, "tensorrt_llm.runtime.ModelRunnerCpp.gather_context_logits", false]], "gather_generation_logits (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.gather_generation_logits", false]], "gather_generation_logits (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.gather_generation_logits", false]], "gather_generation_logits (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.gather_generation_logits", false]], "gather_generation_logits (tensorrt_llm.runtime.modelrunner property)": [[87, "tensorrt_llm.runtime.ModelRunner.gather_generation_logits", false]], "gather_generation_logits (tensorrt_llm.runtime.modelrunnercpp property)": [[87, "tensorrt_llm.runtime.ModelRunnerCpp.gather_generation_logits", false]], "gather_last_token_logits() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.gather_last_token_logits", false]], "gather_nd() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.gather_nd", false]], "gegelu() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.gegelu", false]], "geglu() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.geglu", false]], "gelu() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.gelu", false]], "gemm_allreduce() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.gemm_allreduce", false]], "gemm_allreduce_plugin 
(tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.gemm_allreduce_plugin", false]], "gemm_allreduce_plugin (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.gemm_allreduce_plugin", false]], "gemm_swiglu() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.gemm_swiglu", false]], "gemma2_added_fields (tensorrt_llm.models.gemmaconfig attribute)": [[84, "tensorrt_llm.models.GemmaConfig.GEMMA2_ADDED_FIELDS", false]], "gemma2_config() (tensorrt_llm.models.gemmaconfig method)": [[84, "tensorrt_llm.models.GemmaConfig.gemma2_config", false]], "gemma3_added_fields (tensorrt_llm.models.gemmaconfig attribute)": [[84, "tensorrt_llm.models.GemmaConfig.GEMMA3_ADDED_FIELDS", false]], "gemma3_config() (tensorrt_llm.models.gemmaconfig method)": [[84, "tensorrt_llm.models.GemmaConfig.gemma3_config", false]], "gemma_added_fields (tensorrt_llm.models.gemmaconfig attribute)": [[84, "tensorrt_llm.models.GemmaConfig.GEMMA_ADDED_FIELDS", false]], "gemmaconfig (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.GemmaConfig", false]], "gemmaforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.GemmaForCausalLM", false]], "generate() (tensorrt_llm.llmapi.llm method)": [[70, "tensorrt_llm.llmapi.LLM.generate", false]], "generate() (tensorrt_llm.runtime.encdecmodelrunner method)": [[87, "tensorrt_llm.runtime.EncDecModelRunner.generate", false]], "generate() (tensorrt_llm.runtime.modelrunner method)": [[87, "tensorrt_llm.runtime.ModelRunner.generate", false]], "generate() (tensorrt_llm.runtime.modelrunnercpp method)": [[87, "tensorrt_llm.runtime.ModelRunnerCpp.generate", false]], "generate() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.generate", false]], "generate() (tensorrt_llm.runtime.qwenforcausallmgenerationsession method)": [[87, "tensorrt_llm.runtime.QWenForCausalLMGenerationSession.generate", false]], 
"generate_alibi_biases() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.generate_alibi_biases", false]], "generate_alibi_slopes() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.generate_alibi_slopes", false]], "generate_async() (tensorrt_llm.llmapi.llm method)": [[70, "tensorrt_llm.llmapi.LLM.generate_async", false]], "generate_logn_scaling() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.generate_logn_scaling", false]], "generation_logits (tensorrt_llm.llmapi.completionoutput attribute)": [[70, "tensorrt_llm.llmapi.CompletionOutput.generation_logits", false]], "generationsequence (class in tensorrt_llm.runtime)": [[87, "tensorrt_llm.runtime.GenerationSequence", false]], "generationsession (class in tensorrt_llm.runtime)": [[87, "tensorrt_llm.runtime.GenerationSession", false]], "get_1d_sincos_pos_embed_from_grid() (in module tensorrt_llm.layers.embedding)": [[83, "tensorrt_llm.layers.embedding.get_1d_sincos_pos_embed_from_grid", false]], "get_2d_sincos_pos_embed() (in module tensorrt_llm.layers.embedding)": [[83, "tensorrt_llm.layers.embedding.get_2d_sincos_pos_embed", false]], "get_2d_sincos_pos_embed_from_grid() (in module tensorrt_llm.layers.embedding)": [[83, "tensorrt_llm.layers.embedding.get_2d_sincos_pos_embed_from_grid", false]], "get_audio_features() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.get_audio_features", false]], "get_batch_idx() (tensorrt_llm.runtime.generationsequence method)": [[87, "tensorrt_llm.runtime.GenerationSequence.get_batch_idx", false]], "get_block_offsets() (tensorrt_llm.runtime.kvcachemanager method)": [[87, "tensorrt_llm.runtime.KVCacheManager.get_block_offsets", false]], "get_comm() (tensorrt_llm.llmapi.mpicommsession method)": [[70, "tensorrt_llm.llmapi.MpiCommSession.get_comm", false]], "get_config_group() (tensorrt_llm.models.pretrainedconfig method)": [[84, 
"tensorrt_llm.models.PretrainedConfig.get_config_group", false]], "get_context_phase_params() (tensorrt_llm.llmapi.disaggregatedparams method)": [[70, "tensorrt_llm.llmapi.DisaggregatedParams.get_context_phase_params", false]], "get_first_past_key_value() (tensorrt_llm.layers.attention.keyvaluecacheparams method)": [[83, "tensorrt_llm.layers.attention.KeyValueCacheParams.get_first_past_key_value", false]], "get_hf_config() (tensorrt_llm.models.gemmaconfig static method)": [[84, "tensorrt_llm.models.GemmaConfig.get_hf_config", false]], "get_kv_cache_events() (tensorrt_llm.llmapi.llm method)": [[70, "tensorrt_llm.llmapi.LLM.get_kv_cache_events", false]], "get_kv_cache_events_async() (tensorrt_llm.llmapi.llm method)": [[70, "tensorrt_llm.llmapi.LLM.get_kv_cache_events_async", false]], "get_next_medusa_tokens() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.get_next_medusa_tokens", false]], "get_num_heads_kv() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.get_num_heads_kv", false]], "get_parent() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.get_parent", false]], "get_pytorch_backend_config() (tensorrt_llm.llmapi.torchllmargs method)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.get_pytorch_backend_config", false]], "get_request_type() (tensorrt_llm.llmapi.disaggregatedparams method)": [[70, "tensorrt_llm.llmapi.DisaggregatedParams.get_request_type", false]], "get_rope_index() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.get_rope_index", false]], "get_seq_idx() (tensorrt_llm.runtime.generationsequence method)": [[87, "tensorrt_llm.runtime.GenerationSequence.get_seq_idx", false]], "get_stats() (tensorrt_llm.llmapi.llm method)": [[70, "tensorrt_llm.llmapi.LLM.get_stats", false]], "get_stats_async() (tensorrt_llm.llmapi.llm method)": [[70, "tensorrt_llm.llmapi.LLM.get_stats_async", 
false]], "get_timestep_embedding() (in module tensorrt_llm.layers.embedding)": [[83, "tensorrt_llm.layers.embedding.get_timestep_embedding", false]], "get_users() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.get_users", false]], "get_visual_features() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.get_visual_features", false]], "get_weight() (tensorrt_llm.layers.linear.linearbase method)": [[83, "tensorrt_llm.layers.linear.LinearBase.get_weight", false]], "gpt_attention() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.gpt_attention", false]], "gpt_attention_plugin (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.gpt_attention_plugin", false]], "gptconfig (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.GPTConfig", false]], "gptforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.GPTForCausalLM", false]], "gptjconfig (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.GPTJConfig", false]], "gptjforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.GPTJForCausalLM", false]], "gptjmodel (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.GPTJModel", false]], "gptmodel (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.GPTModel", false]], "gptneoxforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.GPTNeoXForCausalLM", false]], "gptneoxmodel (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.GPTNeoXModel", false]], "gpu_weights_percent (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.gpu_weights_percent", false]], "grammar (tensorrt_llm.llmapi.guideddecodingparams attribute)": [[70, "tensorrt_llm.llmapi.GuidedDecodingParams.grammar", false]], "greedy_sampling (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.EagleDecodingConfig.greedy_sampling", 
false]], "group_norm() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.group_norm", false]], "group_size (tensorrt_llm.llmapi.quantconfig attribute)": [[70, "tensorrt_llm.llmapi.QuantConfig.group_size", false]], "groupnorm (class in tensorrt_llm.layers.normalization)": [[83, "tensorrt_llm.layers.normalization.GroupNorm", false]], "groupnorm (tensorrt_llm.functional.layernormtype attribute)": [[82, "tensorrt_llm.functional.LayerNormType.GroupNorm", false]], "gt() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.gt", false]], "guaranteed_no_evict (tensorrt_llm.llmapi.capacityschedulerpolicy attribute)": [[70, "tensorrt_llm.llmapi.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT", false]], "guided_decoding (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.guided_decoding", false]], "guideddecodingparams (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.GuidedDecodingParams", false]], "handle_per_step() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.handle_per_step", false]], "has_affine() (tensorrt_llm.functional.allreduceparams method)": [[82, "tensorrt_llm.functional.AllReduceParams.has_affine", false]], "has_bias() (tensorrt_llm.functional.allreduceparams method)": [[82, "tensorrt_llm.functional.AllReduceParams.has_bias", false]], "has_config_group() (tensorrt_llm.models.pretrainedconfig method)": [[84, "tensorrt_llm.models.PretrainedConfig.has_config_group", false]], "has_position_embedding (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.has_position_embedding", false]], "has_position_embedding (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.has_position_embedding", false]], "has_scale() (tensorrt_llm.functional.allreduceparams method)": [[82, "tensorrt_llm.functional.AllReduceParams.has_scale", false]], "has_token_type_embedding 
(tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.has_token_type_embedding", false]], "has_token_type_embedding (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.has_token_type_embedding", false]], "has_zero_point (tensorrt_llm.llmapi.quantconfig attribute)": [[70, "tensorrt_llm.llmapi.QuantConfig.has_zero_point", false]], "head_size (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.head_size", false]], "head_size (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.head_size", false]], "hidden_size (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.hidden_size", false]], "hidden_size (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.hidden_size", false]], "hidden_size (tensorrt_llm.runtime.modelrunner property)": [[87, "tensorrt_llm.runtime.ModelRunner.hidden_size", false]], "hidden_size (tensorrt_llm.runtime.modelrunnercpp property)": [[87, "tensorrt_llm.runtime.ModelRunnerCpp.hidden_size", false]], "host_cache_size (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[70, "tensorrt_llm.llmapi.KvCacheConfig.host_cache_size", false]], "identity() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.identity", false]], "ignore_eos (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.ignore_eos", false]], "include_stop_str_in_output (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.include_stop_str_in_output", false]], "index (tensorrt_llm.llmapi.completionoutput attribute)": [[70, "tensorrt_llm.llmapi.CompletionOutput.index", false]], "index_select() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.index_select", false]], "infer_shapes() (tensorrt_llm.runtime.session method)": [[87, 
"tensorrt_llm.runtime.Session.infer_shapes", false]], "inflight (tensorrt_llm.llmapi.batchingtype attribute)": [[70, "tensorrt_llm.llmapi.BatchingType.INFLIGHT", false]], "init_audio_encoder() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.init_audio_encoder", false]], "init_image_encoder() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.init_image_encoder", false]], "init_llm() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.init_llm", false]], "init_processor() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.init_processor", false]], "init_tokenizer() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.init_tokenizer", false]], "input_timing_cache (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.input_timing_cache", false]], "int8 (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.INT8", false]], "int_clip() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.int_clip", false]], "interpolate() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.interpolate", false]], "is_alibi() (tensorrt_llm.functional.positionembeddingtype method)": [[82, "tensorrt_llm.functional.PositionEmbeddingType.is_alibi", false]], "is_deferred() (tensorrt_llm.functional.positionembeddingtype method)": [[82, "tensorrt_llm.functional.PositionEmbeddingType.is_deferred", false]], "is_dynamic() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.is_dynamic", false]], "is_gated_activation() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.is_gated_activation", false]], "is_gemma_2 (tensorrt_llm.models.gemmaconfig property)": [[84, "tensorrt_llm.models.GemmaConfig.is_gemma_2", 
false]], "is_gemma_3 (tensorrt_llm.models.gemmaconfig property)": [[84, "tensorrt_llm.models.GemmaConfig.is_gemma_3", false]], "is_keep_all (tensorrt_llm.llmapi.ngramdecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.NGramDecodingConfig.is_keep_all", false]], "is_medusa_mode (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.is_medusa_mode", false]], "is_module_excluded_from_quantization() (tensorrt_llm.llmapi.quantconfig method)": [[70, "tensorrt_llm.llmapi.QuantConfig.is_module_excluded_from_quantization", false]], "is_mrope() (tensorrt_llm.functional.positionembeddingtype method)": [[82, "tensorrt_llm.functional.PositionEmbeddingType.is_mrope", false]], "is_public_pool (tensorrt_llm.llmapi.ngramdecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.NGramDecodingConfig.is_public_pool", false]], "is_redrafter_mode (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.is_redrafter_mode", false]], "is_rope() (tensorrt_llm.functional.positionembeddingtype method)": [[82, "tensorrt_llm.functional.PositionEmbeddingType.is_rope", false]], "is_trt_wrapper() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.is_trt_wrapper", false]], "is_use_oldest (tensorrt_llm.llmapi.ngramdecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.NGramDecodingConfig.is_use_oldest", false]], "is_valid() (tensorrt_llm.layers.attention.attentionparams method)": [[83, "tensorrt_llm.layers.attention.AttentionParams.is_valid", false]], "is_valid() (tensorrt_llm.layers.attention.keyvaluecacheparams method)": [[83, "tensorrt_llm.layers.attention.KeyValueCacheParams.is_valid", false]], "is_valid_cross_attn() (tensorrt_llm.layers.attention.attentionparams method)": [[83, "tensorrt_llm.layers.attention.AttentionParams.is_valid_cross_attn", false]], "joint_attn_forward() (tensorrt_llm.layers.attention.diffusersattention method)": [[83, 
"tensorrt_llm.layers.attention.DiffusersAttention.joint_attn_forward", false]], "json (tensorrt_llm.llmapi.guideddecodingparams attribute)": [[70, "tensorrt_llm.llmapi.GuidedDecodingParams.json", false]], "json_object (tensorrt_llm.llmapi.guideddecodingparams attribute)": [[70, "tensorrt_llm.llmapi.GuidedDecodingParams.json_object", false]], "keyvaluecacheparams (class in tensorrt_llm.layers.attention)": [[83, "tensorrt_llm.layers.attention.KeyValueCacheParams", false]], "kv_cache_dtype (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.kv_cache_dtype", false]], "kv_cache_quant_algo (tensorrt_llm.llmapi.quantconfig attribute)": [[70, "tensorrt_llm.llmapi.QuantConfig.kv_cache_quant_algo", false]], "kv_cache_type (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.kv_cache_type", false]], "kv_cache_type (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.kv_cache_type", false]], "kv_cache_type (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.kv_cache_type", false]], "kv_dtype (tensorrt_llm.models.pretrainedconfig property)": [[84, "tensorrt_llm.models.PretrainedConfig.kv_dtype", false]], "kvcacheconfig (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.KvCacheConfig", false]], "kvcachemanager (class in tensorrt_llm.runtime)": [[87, "tensorrt_llm.runtime.KVCacheManager", false]], "kvcacheretentionconfig (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.KvCacheRetentionConfig", false]], "kvcacheretentionconfig.tokenrangeretentionconfig (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig", false]], "labelembedding (class in tensorrt_llm.layers.embedding)": [[83, "tensorrt_llm.layers.embedding.LabelEmbedding", false]], "language_adapter_config (tensorrt_llm.runtime.modelconfig attribute)": [[87, 
"tensorrt_llm.runtime.ModelConfig.language_adapter_config", false]], "last_layer (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.last_layer", false]], "last_process_for_ub (tensorrt_llm.functional.allreducefusionop attribute)": [[82, "tensorrt_llm.functional.AllReduceFusionOp.LAST_PROCESS_FOR_UB", false]], "layer_norm() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.layer_norm", false]], "layer_quant_mode (tensorrt_llm.llmapi.quantconfig property)": [[70, "tensorrt_llm.llmapi.QuantConfig.layer_quant_mode", false]], "layer_types (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.layer_types", false]], "layernorm (class in tensorrt_llm.layers.normalization)": [[83, "tensorrt_llm.layers.normalization.LayerNorm", false]], "layernorm (tensorrt_llm.functional.layernormtype attribute)": [[82, "tensorrt_llm.functional.LayerNormType.LayerNorm", false]], "layernormpositiontype (class in tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.LayerNormPositionType", false]], "layernormtype (class in tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.LayerNormType", false]], "learned_absolute (tensorrt_llm.functional.positionembeddingtype attribute)": [[82, "tensorrt_llm.functional.PositionEmbeddingType.learned_absolute", false]], "length (tensorrt_llm.llmapi.completionoutput attribute)": [[70, "tensorrt_llm.llmapi.CompletionOutput.length", false]], "length (tensorrt_llm.llmapi.completionoutput property)": [[70, "id2", false]], "length_penalty (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.length_penalty", false]], "length_penalty (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.length_penalty", false]], "linear (class in tensorrt_llm.layers.linear)": [[83, "tensorrt_llm.layers.linear.Linear", false]], "linear (tensorrt_llm.functional.rotaryscalingtype attribute)": [[82, 
"tensorrt_llm.functional.RotaryScalingType.linear", false]], "linearactivation (class in tensorrt_llm.layers.mlp)": [[83, "tensorrt_llm.layers.mlp.LinearActivation", false]], "linearapproximategelu (class in tensorrt_llm.layers.mlp)": [[83, "tensorrt_llm.layers.mlp.LinearApproximateGELU", false]], "linearbase (class in tensorrt_llm.layers.linear)": [[83, "tensorrt_llm.layers.linear.LinearBase", false]], "lineargeglu (class in tensorrt_llm.layers.mlp)": [[83, "tensorrt_llm.layers.mlp.LinearGEGLU", false]], "lineargelu (class in tensorrt_llm.layers.mlp)": [[83, "tensorrt_llm.layers.mlp.LinearGELU", false]], "linearswiglu (class in tensorrt_llm.layers.mlp)": [[83, "tensorrt_llm.layers.mlp.LinearSwiGLU", false]], "llama3 (tensorrt_llm.functional.rotaryscalingtype attribute)": [[82, "tensorrt_llm.functional.RotaryScalingType.llama3", false]], "llamaconfig (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.LLaMAConfig", false]], "llamaforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.LLaMAForCausalLM", false]], "llamamodel (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.LLaMAModel", false]], "llavanextvisionconfig (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.LlavaNextVisionConfig", false]], "llavanextvisionwrapper (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.LlavaNextVisionWrapper", false]], "llm (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.LLM", false]], "llm_engine_dir (tensorrt_llm.runtime.multimodalmodelrunner property)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.llm_engine_dir", false]], "llmargs (in module tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.LlmArgs", false]], "load() (tensorrt_llm.models.pretrainedmodel method)": [[84, "tensorrt_llm.models.PretrainedModel.load", false]], "load() (tensorrt_llm.models.sd3transformer2dmodel method)": [[84, "tensorrt_llm.models.SD3Transformer2DModel.load", false]], "load_format (tensorrt_llm.llmapi.torchllmargs attribute)": 
[[70, "tensorrt_llm.llmapi.TorchLlmArgs.load_format", false]], "load_test_audio() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.load_test_audio", false]], "load_test_data() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.load_test_data", false]], "locate_accepted_draft_tokens() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.locate_accepted_draft_tokens", false]], "location (tensorrt_llm.functional.tensor property)": [[82, "tensorrt_llm.functional.Tensor.location", false]], "log() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.log", false]], "log() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.log", false]], "log_softmax() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.log_softmax", false]], "logits_processor (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.logits_processor", false]], "logitsprocessor (class in tensorrt_llm.runtime)": [[87, "tensorrt_llm.runtime.LogitsProcessor", false]], "logitsprocessorlist (class in tensorrt_llm.runtime)": [[87, "tensorrt_llm.runtime.LogitsProcessorList", false]], "logprobs (tensorrt_llm.llmapi.completionoutput attribute)": [[70, "tensorrt_llm.llmapi.CompletionOutput.logprobs", false]], "logprobs (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.logprobs", false]], "logprobs_diff (tensorrt_llm.llmapi.completionoutput attribute)": [[70, "tensorrt_llm.llmapi.CompletionOutput.logprobs_diff", false]], "logprobs_diff (tensorrt_llm.llmapi.completionoutput property)": [[70, "id3", false]], "long_rope (tensorrt_llm.functional.positionembeddingtype attribute)": [[82, "tensorrt_llm.functional.PositionEmbeddingType.long_rope", false]], "longrope (tensorrt_llm.functional.rotaryscalingtype attribute)": [[82, 
"tensorrt_llm.functional.RotaryScalingType.longrope", false]], "lookahead_config (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.lookahead_config", false]], "lookahead_decoding (tensorrt_llm.models.speculativedecodingmode attribute)": [[84, "tensorrt_llm.models.SpeculativeDecodingMode.LOOKAHEAD_DECODING", false]], "lookaheaddecodingconfig (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.LookaheadDecodingConfig", false]], "lora_config (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.lora_config", false]], "lora_plugin (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.lora_plugin", false]], "lora_plugin() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.lora_plugin", false]], "lora_target_modules (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.lora_target_modules", false]], "low_latency_gemm() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.low_latency_gemm", false]], "low_latency_gemm_swiglu() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.low_latency_gemm_swiglu", false]], "lowprecision (tensorrt_llm.functional.allreducestrategy attribute)": [[82, "tensorrt_llm.functional.AllReduceStrategy.LOWPRECISION", false]], "lt() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.lt", false]], "make_causal_mask() (in module tensorrt_llm.layers.attention)": [[83, "tensorrt_llm.layers.attention.make_causal_mask", false]], "mamba_conv1d() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.mamba_conv1d", false]], "mamba_conv1d_plugin (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.mamba_conv1d_plugin", false]], "mambaforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.MambaForCausalLM", false]], "mapping (tensorrt_llm.runtime.generationsession attribute)": [[87, 
"tensorrt_llm.runtime.GenerationSession.mapping", false]], "mapping (tensorrt_llm.runtime.modelrunner property)": [[87, "tensorrt_llm.runtime.ModelRunner.mapping", false]], "mark_output() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.mark_output", false]], "masked_scatter() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.masked_scatter", false]], "masked_select() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.masked_select", false]], "matmul() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.matmul", false]], "max() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.max", false]], "max() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.max", false]], "max_attention_window (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[70, "tensorrt_llm.llmapi.KvCacheConfig.max_attention_window", false]], "max_attention_window_size (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.max_attention_window_size", false]], "max_batch_size (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.max_batch_size", false]], "max_batch_size (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.max_batch_size", false]], "max_beam_width (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.max_beam_width", false]], "max_beam_width (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.max_beam_width", false]], "max_cache_storage_gb (tensorrt_llm.llmapi.buildcacheconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildCacheConfig.max_cache_storage_gb", false]], "max_cache_storage_gb (tensorrt_llm.llmapi.buildcacheconfig property)": [[70, "id8", false]], "max_cpu_loras (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.max_cpu_loras", false]], 
"max_cpu_loras (tensorrt_llm.llmapi.trtllmargs attribute)": [[70, "tensorrt_llm.llmapi.TrtLlmArgs.max_cpu_loras", false]], "max_draft_len (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.max_draft_len", false]], "max_draft_tokens (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.max_draft_tokens", false]], "max_encoder_input_len (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.max_encoder_input_len", false]], "max_input_len (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.max_input_len", false]], "max_lora_rank (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.max_lora_rank", false]], "max_lora_rank (tensorrt_llm.llmapi.trtllmargs attribute)": [[70, "tensorrt_llm.llmapi.TrtLlmArgs.max_lora_rank", false]], "max_loras (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.max_loras", false]], "max_loras (tensorrt_llm.llmapi.trtllmargs attribute)": [[70, "tensorrt_llm.llmapi.TrtLlmArgs.max_loras", false]], "max_matching_ngram_size (tensorrt_llm.llmapi.ngramdecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.NGramDecodingConfig.max_matching_ngram_size", false]], "max_medusa_tokens (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.max_medusa_tokens", false]], "max_new_tokens (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.max_new_tokens", false]], "max_ngram_size (tensorrt_llm.llmapi.lookaheaddecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.LookaheadDecodingConfig.max_ngram_size", false]], "max_non_leaves_per_layer (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.EagleDecodingConfig.max_non_leaves_per_layer", false]], "max_num_tokens (tensorrt_llm.llmapi.buildconfig attribute)": [[70, 
"tensorrt_llm.llmapi.BuildConfig.max_num_tokens", false]], "max_num_tokens (tensorrt_llm.llmapi.cachetransceiverconfig attribute)": [[70, "tensorrt_llm.llmapi.CacheTransceiverConfig.max_num_tokens", false]], "max_prompt_embedding_table_size (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.max_prompt_embedding_table_size", false]], "max_prompt_embedding_table_size (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.max_prompt_embedding_table_size", false]], "max_prompt_embedding_table_size (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.max_prompt_embedding_table_size", false]], "max_prompt_embedding_table_size (tensorrt_llm.runtime.modelrunner property)": [[87, "tensorrt_llm.runtime.ModelRunner.max_prompt_embedding_table_size", false]], "max_prompt_embedding_table_size (tensorrt_llm.runtime.modelrunnercpp property)": [[87, "tensorrt_llm.runtime.ModelRunnerCpp.max_prompt_embedding_table_size", false]], "max_records (tensorrt_llm.llmapi.buildcacheconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildCacheConfig.max_records", false]], "max_records (tensorrt_llm.llmapi.buildcacheconfig property)": [[70, "id9", false]], "max_seq_len (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.max_seq_len", false]], "max_sequence_length (tensorrt_llm.runtime.modelrunner property)": [[87, "tensorrt_llm.runtime.ModelRunner.max_sequence_length", false]], "max_sequence_length (tensorrt_llm.runtime.modelrunnercpp property)": [[87, "tensorrt_llm.runtime.ModelRunnerCpp.max_sequence_length", false]], "max_tokens (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[70, "tensorrt_llm.llmapi.KvCacheConfig.max_tokens", false]], "max_tokens (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.max_tokens", false]], "max_utilization (tensorrt_llm.llmapi.capacityschedulerpolicy attribute)": [[70, 
"tensorrt_llm.llmapi.CapacitySchedulerPolicy.MAX_UTILIZATION", false]], "max_verification_set_size (tensorrt_llm.llmapi.lookaheaddecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.LookaheadDecodingConfig.max_verification_set_size", false]], "max_window_size (tensorrt_llm.llmapi.lookaheaddecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.LookaheadDecodingConfig.max_window_size", false]], "maximum() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.maximum", false]], "mean() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.mean", false]], "mean() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.mean", false]], "medusa (tensorrt_llm.models.speculativedecodingmode attribute)": [[84, "tensorrt_llm.models.SpeculativeDecodingMode.MEDUSA", false]], "medusa_choices (tensorrt_llm.llmapi.medusadecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.MedusaDecodingConfig.medusa_choices", false]], "medusa_decode_and_verify() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.medusa_decode_and_verify", false]], "medusa_paths (tensorrt_llm.runtime.generationsession attribute)": [[87, "tensorrt_llm.runtime.GenerationSession.medusa_paths", false]], "medusa_position_offsets (tensorrt_llm.runtime.generationsession attribute)": [[87, "tensorrt_llm.runtime.GenerationSession.medusa_position_offsets", false]], "medusa_temperature (tensorrt_llm.runtime.generationsession attribute)": [[87, "tensorrt_llm.runtime.GenerationSession.medusa_temperature", false]], "medusa_topks (tensorrt_llm.runtime.generationsession attribute)": [[87, "tensorrt_llm.runtime.GenerationSession.medusa_topks", false]], "medusa_tree_ids (tensorrt_llm.runtime.generationsession attribute)": [[87, "tensorrt_llm.runtime.GenerationSession.medusa_tree_ids", false]], "medusaconfig (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.MedusaConfig", false]], "medusadecodingconfig (class in 
tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.MedusaDecodingConfig", false]], "medusaforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.MedusaForCausalLm", false]], "meshgrid2d() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.meshgrid2d", false]], "min() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.min", false]], "min_latency (tensorrt_llm.functional.allreducestrategy attribute)": [[82, "tensorrt_llm.functional.AllReduceStrategy.MIN_LATENCY", false]], "min_length (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.min_length", false]], "min_p (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.min_p", false]], "min_p (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.min_p", false]], "min_tokens (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.min_tokens", false]], "minimum() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.minimum", false]], "mish (class in tensorrt_llm.layers.activation)": [[83, "tensorrt_llm.layers.activation.Mish", false]], "mixed_precision (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.MIXED_PRECISION", false]], "mixed_sampler (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.mixed_sampler", false]], "mllamaforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.MLLaMAForCausalLM", false]], "mlp (class in tensorrt_llm.layers.mlp)": [[83, "tensorrt_llm.layers.mlp.MLP", false]], "mlp (tensorrt_llm.functional.mlptype attribute)": [[82, "tensorrt_llm.functional.MLPType.MLP", false]], "mlptype (class in tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.MLPType", false]], "model": [[30, "cmdoption-trtllm-serve-serve-arg-MODEL", false]], "model_config (tensorrt_llm.llmapi.cachetransceiverconfig attribute)": 
[[70, "tensorrt_llm.llmapi.CacheTransceiverConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.calibconfig attribute)": [[70, "tensorrt_llm.llmapi.CalibConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.dynamicbatchconfig attribute)": [[70, "tensorrt_llm.llmapi.DynamicBatchConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.EagleDecodingConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.extendedruntimeperfknobconfig attribute)": [[70, "tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[70, "tensorrt_llm.llmapi.KvCacheConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.lookaheaddecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.LookaheadDecodingConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.medusadecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.MedusaDecodingConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.mtpdecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.MTPDecodingConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.ngramdecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.NGramDecodingConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.schedulerconfig attribute)": [[70, "tensorrt_llm.llmapi.SchedulerConfig.model_config", false]], "model_config (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.model_config", false]], "model_config (tensorrt_llm.llmapi.trtllmargs attribute)": [[70, "tensorrt_llm.llmapi.TrtLlmArgs.model_config", false]], "model_name (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.model_name", false]], "model_post_init() (tensorrt_llm.llmapi.torchllmargs method)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.model_post_init", false]], "model_post_init() (tensorrt_llm.llmapi.trtllmargs 
method)": [[70, "tensorrt_llm.llmapi.TrtLlmArgs.model_post_init", false]], "modelconfig (class in tensorrt_llm.runtime)": [[87, "tensorrt_llm.runtime.ModelConfig", false]], "modelrunner (class in tensorrt_llm.runtime)": [[87, "tensorrt_llm.runtime.ModelRunner", false]], "modelrunnercpp (class in tensorrt_llm.runtime)": [[87, "tensorrt_llm.runtime.ModelRunnerCpp", false]], "module": [[82, "module-tensorrt_llm", false], [82, "module-tensorrt_llm.functional", false], [83, "module-tensorrt_llm", false], [83, "module-tensorrt_llm.layers.activation", false], [83, "module-tensorrt_llm.layers.attention", false], [83, "module-tensorrt_llm.layers.cast", false], [83, "module-tensorrt_llm.layers.conv", false], [83, "module-tensorrt_llm.layers.embedding", false], [83, "module-tensorrt_llm.layers.linear", false], [83, "module-tensorrt_llm.layers.mlp", false], [83, "module-tensorrt_llm.layers.normalization", false], [83, "module-tensorrt_llm.layers.pooling", false], [84, "module-tensorrt_llm", false], [84, "module-tensorrt_llm.models", false], [85, "module-tensorrt_llm", false], [85, "module-tensorrt_llm.plugin", false], [86, "module-tensorrt_llm", false], [86, "module-tensorrt_llm.quantization", false], [87, "module-tensorrt_llm", false], [87, "module-tensorrt_llm.runtime", false]], "modulo() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.modulo", false]], "moe (tensorrt_llm.functional.sidestreamidtype attribute)": [[82, "tensorrt_llm.functional.SideStreamIDType.moe", false]], "moe_allreduce_residual_rms_norm (tensorrt_llm.functional.allreducefusionop attribute)": [[82, "tensorrt_llm.functional.AllReduceFusionOp.MOE_ALLREDUCE_RESIDUAL_RMS_NORM", false]], "moe_backend (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.moe_backend", false]], "moe_load_balancer (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.moe_load_balancer", false]], "moe_max_num_tokens 
(tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.moe_max_num_tokens", false]], "monitor_memory (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.monitor_memory", false]], "mpicommsession (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.MpiCommSession", false]], "mptforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.MPTForCausalLM", false]], "mptmodel (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.MPTModel", false]], "mrope (tensorrt_llm.functional.positionembeddingtype attribute)": [[82, "tensorrt_llm.functional.PositionEmbeddingType.mrope", false]], "mrope (tensorrt_llm.functional.rotaryscalingtype attribute)": [[82, "tensorrt_llm.functional.RotaryScalingType.mrope", false]], "mropeparams (class in tensorrt_llm.layers.attention)": [[83, "tensorrt_llm.layers.attention.MropeParams", false]], "msg (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "id10", false], [70, "id13", false], [70, "id16", false], [70, "tensorrt_llm.llmapi.TorchLlmArgs.msg", false]], "msg (tensorrt_llm.llmapi.trtllmargs attribute)": [[70, "id19", false], [70, "id22", false], [70, "id25", false], [70, "id28", false], [70, "id31", false], [70, "tensorrt_llm.llmapi.TrtLlmArgs.msg", false]], "mtpdecodingconfig (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.MTPDecodingConfig", false]], "mul() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.mul", false]], "multi_block_mode (tensorrt_llm.llmapi.extendedruntimeperfknobconfig attribute)": [[70, "tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.multi_block_mode", false]], "multimodalmodelrunner (class in tensorrt_llm.runtime)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner", false]], "multiply_and_lora() (tensorrt_llm.layers.linear.linearbase method)": [[83, "tensorrt_llm.layers.linear.LinearBase.multiply_and_lora", false]], "multiply_collect() (tensorrt_llm.layers.linear.linearbase method)": 
[[83, "tensorrt_llm.layers.linear.LinearBase.multiply_collect", false]], "multiply_collect() (tensorrt_llm.layers.linear.rowlinear method)": [[83, "tensorrt_llm.layers.linear.RowLinear.multiply_collect", false]], "n (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.n", false]], "name (tensorrt_llm.functional.tensor property)": [[82, "tensorrt_llm.functional.Tensor.name", false]], "name (tensorrt_llm.runtime.tensorinfo attribute)": [[87, "tensorrt_llm.runtime.TensorInfo.name", false]], "native_quant_flow (tensorrt_llm.models.gemmaforcausallm attribute)": [[84, "tensorrt_llm.models.GemmaForCausalLM.NATIVE_QUANT_FLOW", false]], "nccl (tensorrt_llm.functional.allreducestrategy attribute)": [[82, "tensorrt_llm.functional.AllReduceStrategy.NCCL", false]], "ndim() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.ndim", false]], "network (tensorrt_llm.functional.tensor property)": [[82, "tensorrt_llm.functional.Tensor.network", false]], "next_medusa_input_ids() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.next_medusa_input_ids", false]], "ngram (tensorrt_llm.models.speculativedecodingmode attribute)": [[84, "tensorrt_llm.models.SpeculativeDecodingMode.NGRAM", false]], "ngramdecodingconfig (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.NGramDecodingConfig", false]], "no_quant (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.NO_QUANT", false]], "no_repeat_ngram_size (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.no_repeat_ngram_size", false]], "no_repeat_ngram_size (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.no_repeat_ngram_size", false]], "non_gated_version() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.non_gated_version", false]], "none (tensorrt_llm.functional.allreducefusionop attribute)": 
[[82, "tensorrt_llm.functional.AllReduceFusionOp.NONE", false]], "none (tensorrt_llm.functional.rotaryscalingtype attribute)": [[82, "tensorrt_llm.functional.RotaryScalingType.none", false]], "none (tensorrt_llm.models.speculativedecodingmode attribute)": [[84, "tensorrt_llm.models.SpeculativeDecodingMode.NONE", false]], "nonzero() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.nonzero", false]], "not_op() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.not_op", false]], "num_beams (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.num_beams", false]], "num_draft_tokens (tensorrt_llm.runtime.generationsession attribute)": [[87, "tensorrt_llm.runtime.GenerationSession.num_draft_tokens", false]], "num_eagle_layers (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.EagleDecodingConfig.num_eagle_layers", false]], "num_heads (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.num_heads", false]], "num_heads (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.num_heads", false]], "num_heads (tensorrt_llm.runtime.modelrunner property)": [[87, "tensorrt_llm.runtime.ModelRunner.num_heads", false]], "num_heads (tensorrt_llm.runtime.modelrunnercpp property)": [[87, "tensorrt_llm.runtime.ModelRunnerCpp.num_heads", false]], "num_kv_heads (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.num_kv_heads", false]], "num_kv_heads_per_cross_attn_layer (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.num_kv_heads_per_cross_attn_layer", false]], "num_kv_heads_per_layer (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.num_kv_heads_per_layer", false]], "num_layers (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.num_layers", false]], 
"num_layers (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.num_layers", false]], "num_layers (tensorrt_llm.runtime.modelrunner property)": [[87, "tensorrt_llm.runtime.ModelRunner.num_layers", false]], "num_layers (tensorrt_llm.runtime.modelrunnercpp property)": [[87, "tensorrt_llm.runtime.ModelRunnerCpp.num_layers", false]], "num_medusa_heads (tensorrt_llm.llmapi.medusadecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.MedusaDecodingConfig.num_medusa_heads", false]], "num_medusa_heads (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.num_medusa_heads", false]], "num_medusa_heads (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.num_medusa_heads", false]], "num_nextn_predict_layers (tensorrt_llm.llmapi.mtpdecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.MTPDecodingConfig.num_nextn_predict_layers", false]], "num_return_sequences (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.num_return_sequences", false]], "numel() (tensorrt_llm.runtime.tensorinfo method)": [[87, "tensorrt_llm.runtime.TensorInfo.numel", false]], "nvfp4 (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.NVFP4", false]], "nvinfer1 (c++ type)": [[1, "_CPPv48nvinfer1", false]], "onboard_blocks (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[70, "tensorrt_llm.llmapi.KvCacheConfig.onboard_blocks", false]], "oneshot (tensorrt_llm.functional.allreducestrategy attribute)": [[82, "tensorrt_llm.functional.AllReduceStrategy.ONESHOT", false]], "op_and() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.op_and", false]], "op_or() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.op_or", false]], "op_xor() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.op_xor", false]], "opaque_state (tensorrt_llm.llmapi.disaggregatedparams attribute)": [[70, 
"tensorrt_llm.llmapi.DisaggregatedParams.opaque_state", false]], "opt_batch_size (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.opt_batch_size", false]], "opt_num_tokens (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.opt_num_tokens", false]], "optforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.OPTForCausalLM", false]], "optmodel (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.OPTModel", false]], "outer() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.outer", false]], "output_cum_log_probs (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.output_cum_log_probs", false]], "output_log_probs (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.output_log_probs", false]], "output_sequence_lengths (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.output_sequence_lengths", false]], "output_timing_cache (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.output_timing_cache", false]], "outputs (tensorrt_llm.llmapi.requestoutput attribute)": [[70, "tensorrt_llm.llmapi.RequestOutput.outputs", false]], "pad() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.pad", false]], "pad_id (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.pad_id", false]], "pad_id (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.pad_id", false]], "padding (tensorrt_llm.functional.attentionmasktype attribute)": [[82, "tensorrt_llm.functional.AttentionMaskType.padding", false]], "paged_kv_cache (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.paged_kv_cache", false]], "paged_state (tensorrt_llm.runtime.generationsession property)": [[87, 
"tensorrt_llm.runtime.GenerationSession.paged_state", false]], "paged_state (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.paged_state", false]], "permute() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.permute", false]], "permute() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.permute", false]], "phi3forcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.Phi3ForCausalLM", false]], "phi3model (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.Phi3Model", false]], "phiforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.PhiForCausalLM", false]], "phimodel (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.PhiModel", false]], "pixartalphatextprojection (class in tensorrt_llm.layers.embedding)": [[83, "tensorrt_llm.layers.embedding.PixArtAlphaTextProjection", false]], "plugin_config (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.plugin_config", false]], "pluginconfig (class in tensorrt_llm.plugin)": [[85, "tensorrt_llm.plugin.PluginConfig", false]], "positionembeddingtype (class in tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.PositionEmbeddingType", false]], "post_layernorm (tensorrt_llm.functional.layernormpositiontype attribute)": [[82, "tensorrt_llm.functional.LayerNormPositionType.post_layernorm", false]], "posterior_threshold (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.EagleDecodingConfig.posterior_threshold", false]], "postprocess() (tensorrt_llm.layers.attention.attention method)": [[83, "tensorrt_llm.layers.attention.Attention.postprocess", false]], "postprocess() (tensorrt_llm.layers.attention.deepseekv2attention method)": [[83, "tensorrt_llm.layers.attention.DeepseekV2Attention.postprocess", false]], "postprocess() (tensorrt_llm.layers.embedding.embedding method)": [[83, 
"tensorrt_llm.layers.embedding.Embedding.postprocess", false]], "postprocess() (tensorrt_llm.layers.linear.linear method)": [[83, "tensorrt_llm.layers.linear.Linear.postprocess", false]], "pow() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.pow", false]], "pp_communicate_final_output_ids() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.pp_communicate_final_output_ids", false]], "pp_communicate_new_tokens() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.pp_communicate_new_tokens", false]], "pre_layernorm (tensorrt_llm.functional.layernormpositiontype attribute)": [[82, "tensorrt_llm.functional.LayerNormPositionType.pre_layernorm", false]], "pre_quant_scale (tensorrt_llm.llmapi.quantconfig attribute)": [[70, "tensorrt_llm.llmapi.QuantConfig.pre_quant_scale", false]], "precompute_relative_attention_bias() (tensorrt_llm.models.decodermodel method)": [[84, "tensorrt_llm.models.DecoderModel.precompute_relative_attention_bias", false]], "precompute_relative_attention_bias() (tensorrt_llm.models.encodermodel method)": [[84, "tensorrt_llm.models.EncoderModel.precompute_relative_attention_bias", false]], "precompute_relative_attention_bias() (tensorrt_llm.models.whisperencoder method)": [[84, "tensorrt_llm.models.WhisperEncoder.precompute_relative_attention_bias", false]], "prepare_inputs() (tensorrt_llm.models.chatglmforcausallm method)": [[84, "tensorrt_llm.models.ChatGLMForCausalLM.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.decodermodel method)": [[84, "tensorrt_llm.models.DecoderModel.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.dit method)": [[84, "tensorrt_llm.models.DiT.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.eagleforcausallm method)": [[84, "tensorrt_llm.models.EagleForCausalLM.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.encodermodel method)": [[84, 
"tensorrt_llm.models.EncoderModel.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.llavanextvisionwrapper method)": [[84, "tensorrt_llm.models.LlavaNextVisionWrapper.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.mambaforcausallm method)": [[84, "tensorrt_llm.models.MambaForCausalLM.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.mllamaforcausallm method)": [[84, "tensorrt_llm.models.MLLaMAForCausalLM.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.pretrainedmodel method)": [[84, "tensorrt_llm.models.PretrainedModel.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.recurrentgemmaforcausallm method)": [[84, "tensorrt_llm.models.RecurrentGemmaForCausalLM.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.redrafterforcausallm method)": [[84, "tensorrt_llm.models.ReDrafterForCausalLM.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.sd3transformer2dmodel method)": [[84, "tensorrt_llm.models.SD3Transformer2DModel.prepare_inputs", false]], "prepare_inputs() (tensorrt_llm.models.whisperencoder method)": [[84, "tensorrt_llm.models.WhisperEncoder.prepare_inputs", false]], "prepare_position_ids_for_cogvlm() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.prepare_position_ids_for_cogvlm", false]], "prepare_recurrent_inputs() (tensorrt_llm.models.recurrentgemmaforcausallm method)": [[84, "tensorrt_llm.models.RecurrentGemmaForCausalLM.prepare_recurrent_inputs", false]], "preprocess() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.preprocess", false]], "presence_penalty (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.presence_penalty", false]], "presence_penalty (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.presence_penalty", false]], "pretrainedconfig (class in 
tensorrt_llm.models)": [[84, "tensorrt_llm.models.PretrainedConfig", false]], "pretrainedmodel (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.PretrainedModel", false]], "print_iter_log (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.print_iter_log", false]], "priority (tensorrt_llm.llmapi.kvcacheretentionconfig.tokenrangeretentionconfig property)": [[70, "tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.priority", false]], "process_input() (tensorrt_llm.runtime.encdecmodelrunner method)": [[87, "tensorrt_llm.runtime.EncDecModelRunner.process_input", false]], "process_logits_including_draft() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.process_logits_including_draft", false]], "prod() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.prod", false]], "profiler (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.profiler", false]], "profiling_verbosity (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.profiling_verbosity", false]], "prompt (tensorrt_llm.llmapi.requestoutput attribute)": [[70, "tensorrt_llm.llmapi.RequestOutput.prompt", false]], "prompt (tensorrt_llm.llmapi.requestoutput property)": [[70, "id6", false]], "prompt_logprobs (tensorrt_llm.llmapi.completionoutput attribute)": [[70, "tensorrt_llm.llmapi.CompletionOutput.prompt_logprobs", false]], "prompt_logprobs (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.prompt_logprobs", false]], "prompt_lookup_num_tokens (tensorrt_llm.llmapi.ngramdecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.NGramDecodingConfig.prompt_lookup_num_tokens", false]], "prompt_token_ids (tensorrt_llm.llmapi.requestoutput attribute)": [[70, "tensorrt_llm.llmapi.RequestOutput.prompt_token_ids", false]], "prompttuningembedding (class in tensorrt_llm.layers.embedding)": 
[[83, "tensorrt_llm.layers.embedding.PromptTuningEmbedding", false]], "ptuning_setup() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup", false]], "ptuning_setup_fuyu() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup_fuyu", false]], "ptuning_setup_llava_next() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup_llava_next", false]], "ptuning_setup_phi3() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup_phi3", false]], "ptuning_setup_pixtral() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.ptuning_setup_pixtral", false]], "python_e2e (tensorrt_llm.runtime.multimodalmodelrunner property)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.python_e2e", false]], "pytorch_eagle_weights_path (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.EagleDecodingConfig.pytorch_eagle_weights_path", false]], "quant_algo (tensorrt_llm.llmapi.quantconfig attribute)": [[70, "tensorrt_llm.llmapi.QuantConfig.quant_algo", false]], "quant_algo (tensorrt_llm.models.pretrainedconfig property)": [[84, "tensorrt_llm.models.PretrainedConfig.quant_algo", false]], "quant_mode (tensorrt_llm.llmapi.quantconfig property)": [[70, "tensorrt_llm.llmapi.QuantConfig.quant_mode", false]], "quant_mode (tensorrt_llm.models.pretrainedconfig property)": [[84, "tensorrt_llm.models.PretrainedConfig.quant_mode", false]], "quant_mode (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.quant_mode", false]], "quant_mode (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.quant_mode", false]], "quantalgo (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.QuantAlgo", 
false]], "quantalgo (class in tensorrt_llm.quantization)": [[86, "tensorrt_llm.quantization.QuantAlgo", false]], "quantconfig (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.QuantConfig", false]], "quantize() (tensorrt_llm.models.baichuanforcausallm class method)": [[84, "tensorrt_llm.models.BaichuanForCausalLM.quantize", false]], "quantize() (tensorrt_llm.models.chatglmforcausallm class method)": [[84, "tensorrt_llm.models.ChatGLMForCausalLM.quantize", false]], "quantize() (tensorrt_llm.models.cogvlmforcausallm class method)": [[84, "tensorrt_llm.models.CogVLMForCausalLM.quantize", false]], "quantize() (tensorrt_llm.models.gemmaforcausallm class method)": [[84, "tensorrt_llm.models.GemmaForCausalLM.quantize", false]], "quantize() (tensorrt_llm.models.gptforcausallm class method)": [[84, "tensorrt_llm.models.GPTForCausalLM.quantize", false]], "quantize() (tensorrt_llm.models.llamaforcausallm class method)": [[84, "tensorrt_llm.models.LLaMAForCausalLM.quantize", false]], "quantize() (tensorrt_llm.models.pretrainedmodel class method)": [[84, "tensorrt_llm.models.PretrainedModel.quantize", false]], "quantize_and_export() (in module tensorrt_llm.quantization)": [[86, "tensorrt_llm.quantization.quantize_and_export", false]], "quantmode (class in tensorrt_llm.quantization)": [[86, "tensorrt_llm.quantization.QuantMode", false]], "quick_gelu() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.quick_gelu", false]], "qwenforcausallmgenerationsession (class in tensorrt_llm.runtime)": [[87, "tensorrt_llm.runtime.QWenForCausalLMGenerationSession", false]], "rand() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.rand", false]], "random_seed (tensorrt_llm.llmapi.calibconfig attribute)": [[70, "tensorrt_llm.llmapi.CalibConfig.random_seed", false]], "random_seed (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.random_seed", false]], "rank() (tensorrt_llm.functional.tensor method)": [[82, 
"tensorrt_llm.functional.Tensor.rank", false]], "rearrange() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.rearrange", false]], "recurrentgemmaforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.RecurrentGemmaForCausalLM", false]], "recv() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.recv", false]], "redrafter_draft_len_per_beam (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.redrafter_draft_len_per_beam", false]], "redrafter_num_beams (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.redrafter_num_beams", false]], "redrafterforcausallm (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.ReDrafterForCausalLM", false]], "reduce() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.reduce", false]], "reduce_scatter() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.reduce_scatter", false]], "regex (tensorrt_llm.llmapi.guideddecodingparams attribute)": [[70, "tensorrt_llm.llmapi.GuidedDecodingParams.regex", false]], "relative (tensorrt_llm.functional.positionembeddingtype attribute)": [[82, "tensorrt_llm.functional.PositionEmbeddingType.relative", false]], "relaxed_delta (tensorrt_llm.llmapi.mtpdecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.MTPDecodingConfig.relaxed_delta", false]], "relaxed_topk (tensorrt_llm.llmapi.mtpdecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.MTPDecodingConfig.relaxed_topk", false]], "release() (tensorrt_llm.models.pretrainedmodel method)": [[84, "tensorrt_llm.models.PretrainedModel.release", false]], "relu() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.relu", false]], "remove_input_padding (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.remove_input_padding", false]], "remove_input_padding (tensorrt_llm.runtime.modelconfig attribute)": [[87, 
"tensorrt_llm.runtime.ModelConfig.remove_input_padding", false]], "remove_input_padding (tensorrt_llm.runtime.modelrunner property)": [[87, "tensorrt_llm.runtime.ModelRunner.remove_input_padding", false]], "remove_input_padding (tensorrt_llm.runtime.modelrunnercpp property)": [[87, "tensorrt_llm.runtime.ModelRunnerCpp.remove_input_padding", false]], "reorder_kv_cache_for_beam_search() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.reorder_kv_cache_for_beam_search", false]], "repeat() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.repeat", false]], "repeat() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.repeat", false]], "repeat_interleave() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.repeat_interleave", false]], "repetition_penalty (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.repetition_penalty", false]], "repetition_penalty (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.repetition_penalty", false]], "replace_all_uses_with() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.replace_all_uses_with", false]], "request_id (tensorrt_llm.llmapi.requestoutput attribute)": [[70, "tensorrt_llm.llmapi.RequestOutput.request_id", false]], "request_type (tensorrt_llm.llmapi.disaggregatedparams attribute)": [[70, "tensorrt_llm.llmapi.DisaggregatedParams.request_type", false]], "requesterror (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.RequestError", false]], "requestoutput (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.RequestOutput", false]], "residual_rms_norm (tensorrt_llm.functional.allreducefusionop attribute)": [[82, "tensorrt_llm.functional.AllReduceFusionOp.RESIDUAL_RMS_NORM", false]], "residual_rms_norm_out_quant_fp8 (tensorrt_llm.functional.allreducefusionop attribute)": [[82, 
"tensorrt_llm.functional.AllReduceFusionOp.RESIDUAL_RMS_NORM_OUT_QUANT_FP8", false]], "residual_rms_norm_out_quant_nvfp4 (tensorrt_llm.functional.allreducefusionop attribute)": [[82, "tensorrt_llm.functional.AllReduceFusionOp.RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4", false]], "residual_rms_norm_quant_fp8 (tensorrt_llm.functional.allreducefusionop attribute)": [[82, "tensorrt_llm.functional.AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_FP8", false]], "residual_rms_norm_quant_nvfp4 (tensorrt_llm.functional.allreducefusionop attribute)": [[82, "tensorrt_llm.functional.AllReduceFusionOp.RESIDUAL_RMS_NORM_QUANT_NVFP4", false]], "residual_rms_prepost_norm (tensorrt_llm.functional.allreducefusionop attribute)": [[82, "tensorrt_llm.functional.AllReduceFusionOp.RESIDUAL_RMS_PREPOST_NORM", false]], "return_context_logits (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.return_context_logits", false]], "return_dict (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.return_dict", false]], "return_encoder_output (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.return_encoder_output", false]], "return_generation_logits (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.return_generation_logits", false]], "return_perf_metrics (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.return_perf_metrics", false]], "rg_lru() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.rg_lru", false]], "rms_norm() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.rms_norm", false]], "rmsnorm (class in tensorrt_llm.layers.normalization)": [[83, "tensorrt_llm.layers.normalization.RmsNorm", false]], "rmsnorm (tensorrt_llm.functional.layernormtype attribute)": [[82, "tensorrt_llm.functional.LayerNormType.RmsNorm", false]], "rnn_conv_dim_size 
(tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.rnn_conv_dim_size", false]], "rnn_conv_dim_size (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.rnn_conv_dim_size", false]], "rnn_head_size (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.rnn_head_size", false]], "rnn_head_size (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.rnn_head_size", false]], "rnn_hidden_size (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.rnn_hidden_size", false]], "rnn_hidden_size (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.rnn_hidden_size", false]], "robertaforquestionanswering (in module tensorrt_llm.models)": [[84, "tensorrt_llm.models.RobertaForQuestionAnswering", false]], "robertaforsequenceclassification (in module tensorrt_llm.models)": [[84, "tensorrt_llm.models.RobertaForSequenceClassification", false]], "robertamodel (in module tensorrt_llm.models)": [[84, "tensorrt_llm.models.RobertaModel", false]], "rope_gpt_neox (tensorrt_llm.functional.positionembeddingtype attribute)": [[82, "tensorrt_llm.functional.PositionEmbeddingType.rope_gpt_neox", false]], "rope_gptj (tensorrt_llm.functional.positionembeddingtype attribute)": [[82, "tensorrt_llm.functional.PositionEmbeddingType.rope_gptj", false]], "ropeembeddingutils (class in tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.RopeEmbeddingUtils", false]], "rotaryscalingtype (class in tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.RotaryScalingType", false]], "rotate_every_two() (tensorrt_llm.functional.ropeembeddingutils static method)": [[82, "tensorrt_llm.functional.RopeEmbeddingUtils.rotate_every_two", false]], "rotate_half() (tensorrt_llm.functional.ropeembeddingutils static method)": [[82, "tensorrt_llm.functional.RopeEmbeddingUtils.rotate_half", 
false]], "round() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.round", false]], "rowlinear (class in tensorrt_llm.layers.linear)": [[83, "tensorrt_llm.layers.linear.RowLinear", false]], "run() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.run", false]], "run() (tensorrt_llm.runtime.session method)": [[87, "tensorrt_llm.runtime.Session.run", false]], "runtime (tensorrt_llm.runtime.generationsession attribute)": [[87, "tensorrt_llm.runtime.GenerationSession.runtime", false]], "runtime (tensorrt_llm.runtime.session property)": [[87, "tensorrt_llm.runtime.Session.runtime", false]], "samplingconfig (class in tensorrt_llm.runtime)": [[87, "tensorrt_llm.runtime.SamplingConfig", false]], "samplingparams (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.SamplingParams", false]], "save() (tensorrt_llm.llmapi.llm method)": [[70, "tensorrt_llm.llmapi.LLM.save", false]], "save_checkpoint() (tensorrt_llm.models.llavanextvisionwrapper method)": [[84, "tensorrt_llm.models.LlavaNextVisionWrapper.save_checkpoint", false]], "save_checkpoint() (tensorrt_llm.models.pretrainedmodel method)": [[84, "tensorrt_llm.models.PretrainedModel.save_checkpoint", false]], "scatter() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.scatter", false]], "scatter_nd() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.scatter_nd", false]], "schedulerconfig (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.SchedulerConfig", false]], "sd35adalayernormzerox (class in tensorrt_llm.layers.normalization)": [[83, "tensorrt_llm.layers.normalization.SD35AdaLayerNormZeroX", false]], "sd3patchembed (class in tensorrt_llm.layers.embedding)": [[83, "tensorrt_llm.layers.embedding.SD3PatchEmbed", false]], "sd3transformer2dmodel (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.SD3Transformer2DModel", false]], "secondary_offload_min_priority 
(tensorrt_llm.llmapi.kvcacheconfig attribute)": [[70, "tensorrt_llm.llmapi.KvCacheConfig.secondary_offload_min_priority", false]], "seed (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.seed", false]], "select() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.select", false]], "select() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.select", false]], "selective_scan() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.selective_scan", false]], "send() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.send", false]], "serialize_engine() (tensorrt_llm.runtime.modelrunner method)": [[87, "tensorrt_llm.runtime.ModelRunner.serialize_engine", false]], "session (class in tensorrt_llm.runtime)": [[87, "tensorrt_llm.runtime.Session", false]], "set_attn_processor() (tensorrt_llm.models.sd3transformer2dmodel method)": [[84, "tensorrt_llm.models.SD3Transformer2DModel.set_attn_processor", false]], "set_from_optional (c macro)": [[1, "c.SET_FROM_OPTIONAL", false]], "set_if_not_exist() (tensorrt_llm.models.pretrainedconfig method)": [[84, "tensorrt_llm.models.PretrainedConfig.set_if_not_exist", false]], "set_rank() (tensorrt_llm.models.pretrainedconfig method)": [[84, "tensorrt_llm.models.PretrainedConfig.set_rank", false]], "set_rel_attn_table() (tensorrt_llm.layers.attention.attention method)": [[83, "tensorrt_llm.layers.attention.Attention.set_rel_attn_table", false]], "set_shapes() (tensorrt_llm.runtime.session method)": [[87, "tensorrt_llm.runtime.Session.set_shapes", false]], "setup() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.setup", false]], "setup_fake_prompts() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.setup_fake_prompts", false]], "setup_fake_prompts_qwen2vl() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, 
"tensorrt_llm.runtime.MultimodalModelRunner.setup_fake_prompts_qwen2vl", false]], "setup_fake_prompts_vila() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.setup_fake_prompts_vila", false]], "setup_inputs() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.setup_inputs", false]], "shape (tensorrt_llm.functional.tensor property)": [[82, "tensorrt_llm.functional.Tensor.shape", false]], "shape (tensorrt_llm.runtime.tensorinfo attribute)": [[87, "tensorrt_llm.runtime.TensorInfo.shape", false]], "shape() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.shape", false]], "shutdown() (tensorrt_llm.llmapi.llm method)": [[70, "tensorrt_llm.llmapi.LLM.shutdown", false]], "shutdown() (tensorrt_llm.llmapi.mpicommsession method)": [[70, "tensorrt_llm.llmapi.MpiCommSession.shutdown", false]], "sidestreamidtype (class in tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.SideStreamIDType", false]], "sigmoid() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.sigmoid", false]], "silu() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.silu", false]], "sin() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.sin", false]], "sink_token_length (tensorrt_llm.llmapi.kvcacheconfig attribute)": [[70, "tensorrt_llm.llmapi.KvCacheConfig.sink_token_length", false]], "sink_token_length (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.sink_token_length", false]], "size (tensorrt_llm.functional.sliceinputtype attribute)": [[82, "tensorrt_llm.functional.SliceInputType.size", false]], "size() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.size", false]], "skip_cross_attn_blocks (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.skip_cross_attn_blocks", false]], "skip_cross_kv 
(tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.skip_cross_kv", false]], "skip_special_tokens (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.skip_special_tokens", false]], "slice() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.slice", false]], "sliceinputtype (class in tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.SliceInputType", false]], "sliding_window_causal (tensorrt_llm.functional.attentionmasktype attribute)": [[82, "tensorrt_llm.functional.AttentionMaskType.sliding_window_causal", false]], "smoothquant_val (tensorrt_llm.llmapi.quantconfig attribute)": [[70, "tensorrt_llm.llmapi.QuantConfig.smoothquant_val", false]], "softmax() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.softmax", false]], "softplus() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.softplus", false]], "spaces_between_special_tokens (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.spaces_between_special_tokens", false]], "specdecodingparams (class in tensorrt_llm.layers.attention)": [[83, "tensorrt_llm.layers.attention.SpecDecodingParams", false]], "speculative_decoding_mode (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.speculative_decoding_mode", false]], "speculativedecodingmode (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.SpeculativeDecodingMode", false]], "split() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.split", false]], "split() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.split", false]], "split_prompt_by_images() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.split_prompt_by_images", false]], "sqrt() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.sqrt", false]], "sqrt() 
(tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.sqrt", false]], "squared_relu() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.squared_relu", false]], "squeeze() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.squeeze", false]], "squeeze() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.squeeze", false]], "squeeze() (tensorrt_llm.runtime.tensorinfo method)": [[87, "tensorrt_llm.runtime.TensorInfo.squeeze", false]], "stack() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.stack", false]], "start (tensorrt_llm.functional.sliceinputtype attribute)": [[82, "tensorrt_llm.functional.SliceInputType.start", false]], "state_dtype (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.state_dtype", false]], "state_dtype (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.state_dtype", false]], "state_size (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.state_size", false]], "state_size (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.state_size", false]], "static (tensorrt_llm.llmapi.batchingtype attribute)": [[70, "tensorrt_llm.llmapi.BatchingType.STATIC", false]], "static_batch (tensorrt_llm.llmapi.capacityschedulerpolicy attribute)": [[70, "tensorrt_llm.llmapi.CapacitySchedulerPolicy.STATIC_BATCH", false]], "step() (tensorrt_llm.runtime.kvcachemanager method)": [[87, "tensorrt_llm.runtime.KVCacheManager.step", false]], "stop (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.stop", false]], "stop_reason (tensorrt_llm.llmapi.completionoutput attribute)": [[70, "tensorrt_llm.llmapi.CompletionOutput.stop_reason", false]], "stop_token_ids (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.stop_token_ids", 
false]], "stop_words_list (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.stop_words_list", false]], "stoppingcriteria (class in tensorrt_llm.runtime)": [[87, "tensorrt_llm.runtime.StoppingCriteria", false]], "stoppingcriterialist (class in tensorrt_llm.runtime)": [[87, "tensorrt_llm.runtime.StoppingCriteriaList", false]], "stride (tensorrt_llm.functional.sliceinputtype attribute)": [[82, "tensorrt_llm.functional.SliceInputType.stride", false]], "strongly_typed (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.strongly_typed", false]], "structural_tag (tensorrt_llm.llmapi.guideddecodingparams attribute)": [[70, "tensorrt_llm.llmapi.GuidedDecodingParams.structural_tag", false]], "sub() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.sub", false]], "submit() (tensorrt_llm.llmapi.mpicommsession method)": [[70, "tensorrt_llm.llmapi.MpiCommSession.submit", false]], "submit_sync() (tensorrt_llm.llmapi.mpicommsession method)": [[70, "tensorrt_llm.llmapi.MpiCommSession.submit_sync", false]], "sum() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.sum", false]], "swiglu() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.swiglu", false]], "tanh() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.tanh", false]], "temperature (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.temperature", false]], "temperature (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.temperature", false]], "tensor (class in tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.Tensor", false]], "tensorinfo (class in tensorrt_llm.runtime)": [[87, "tensorrt_llm.runtime.TensorInfo", false]], "tensorrt_llm": [[82, "module-tensorrt_llm", false], [83, "module-tensorrt_llm", false], [84, "module-tensorrt_llm", false], [85, "module-tensorrt_llm", false], [86, 
"module-tensorrt_llm", false], [87, "module-tensorrt_llm", false]], "tensorrt_llm (c++ type)": [[0, "_CPPv412tensorrt_llm", false], [1, "_CPPv412tensorrt_llm", false]], "tensorrt_llm.functional": [[82, "module-tensorrt_llm.functional", false]], "tensorrt_llm.layers.activation": [[83, "module-tensorrt_llm.layers.activation", false]], "tensorrt_llm.layers.attention": [[83, "module-tensorrt_llm.layers.attention", false]], "tensorrt_llm.layers.cast": [[83, "module-tensorrt_llm.layers.cast", false]], "tensorrt_llm.layers.conv": [[83, "module-tensorrt_llm.layers.conv", false]], "tensorrt_llm.layers.embedding": [[83, "module-tensorrt_llm.layers.embedding", false]], "tensorrt_llm.layers.linear": [[83, "module-tensorrt_llm.layers.linear", false]], "tensorrt_llm.layers.mlp": [[83, "module-tensorrt_llm.layers.mlp", false]], "tensorrt_llm.layers.normalization": [[83, "module-tensorrt_llm.layers.normalization", false]], "tensorrt_llm.layers.pooling": [[83, "module-tensorrt_llm.layers.pooling", false]], "tensorrt_llm.models": [[84, "module-tensorrt_llm.models", false]], "tensorrt_llm.plugin": [[85, "module-tensorrt_llm.plugin", false]], "tensorrt_llm.quantization": [[86, "module-tensorrt_llm.quantization", false]], "tensorrt_llm.runtime": [[87, "module-tensorrt_llm.runtime", false]], "tensorrt_llm::batch_manager (c++ type)": [[0, "_CPPv4N12tensorrt_llm13batch_managerE", false], [1, "_CPPv4N12tensorrt_llm13batch_managerE", false]], "tensorrt_llm::batch_manager::kv_cache_manager (c++ type)": [[0, "_CPPv4N12tensorrt_llm13batch_manager16kv_cache_managerE", false]], "tensorrt_llm::executor (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executorE", false]], "tensorrt_llm::executor::additionalmodeloutput (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutputE", false]], "tensorrt_llm::executor::additionalmodeloutput::additionalmodeloutput (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutput21AdditionalModelOutputENSt6stringEb", false]], 
"tensorrt_llm::executor::additionalmodeloutput::gathercontext (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutput13gatherContextE", false]], "tensorrt_llm::executor::additionalmodeloutput::name (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutput4nameE", false]], "tensorrt_llm::executor::additionalmodeloutput::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor21AdditionalModelOutputeqERK21AdditionalModelOutput", false]], "tensorrt_llm::executor::additionaloutput (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputE", false]], "tensorrt_llm::executor::additionaloutput::additionaloutput (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputENSt6stringE6Tensor", false], [0, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputERK16AdditionalOutput", false], [0, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputERR16AdditionalOutput", false]], "tensorrt_llm::executor::additionaloutput::name (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput4nameE", false]], "tensorrt_llm::executor::additionaloutput::operator= (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputaSERK16AdditionalOutput", false], [0, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputaSERR16AdditionalOutput", false]], "tensorrt_llm::executor::additionaloutput::output (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput6outputE", false]], "tensorrt_llm::executor::additionaloutput::~additionaloutput (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputD0Ev", false]], "tensorrt_llm::executor::batchingtype (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor12BatchingTypeE", false]], "tensorrt_llm::executor::batchingtype::kinflight (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12BatchingType9kINFLIGHTE", false]], "tensorrt_llm::executor::batchingtype::kstatic (c++ enumerator)": [[0, 
"_CPPv4N12tensorrt_llm8executor12BatchingType7kSTATICE", false]], "tensorrt_llm::executor::beamtokens (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor10BeamTokensE", false]], "tensorrt_llm::executor::bufferview (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor10BufferViewE", false]], "tensorrt_llm::executor::cachetransceiverconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfigE", false]], "tensorrt_llm::executor::cachetransceiverconfig::cachetransceiverconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfig22CacheTransceiverConfigENSt8optionalI6size_tEE", false]], "tensorrt_llm::executor::cachetransceiverconfig::getmaxnumtokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor22CacheTransceiverConfig15getMaxNumTokensEv", false]], "tensorrt_llm::executor::cachetransceiverconfig::mmaxnumtokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfig13mMaxNumTokensE", false]], "tensorrt_llm::executor::cachetransceiverconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor22CacheTransceiverConfigeqERK22CacheTransceiverConfig", false]], "tensorrt_llm::executor::cachetransceiverconfig::setmaxnumtokens (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfig15setMaxNumTokensE6size_t", false]], "tensorrt_llm::executor::capacityschedulerpolicy (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor23CapacitySchedulerPolicyE", false]], "tensorrt_llm::executor::capacityschedulerpolicy::kguaranteed_no_evict (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor23CapacitySchedulerPolicy20kGUARANTEED_NO_EVICTE", false]], "tensorrt_llm::executor::capacityschedulerpolicy::kmax_utilization (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor23CapacitySchedulerPolicy16kMAX_UTILIZATIONE", false]], "tensorrt_llm::executor::capacityschedulerpolicy::kstatic_batch (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor23CapacitySchedulerPolicy13kSTATIC_BATCHE", false]], 
"tensorrt_llm::executor::communicationmode (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor17CommunicationModeE", false]], "tensorrt_llm::executor::communicationmode::kleader (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor17CommunicationMode7kLEADERE", false]], "tensorrt_llm::executor::communicationmode::korchestrator (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor17CommunicationMode13kORCHESTRATORE", false]], "tensorrt_llm::executor::communicationtype (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor17CommunicationTypeE", false]], "tensorrt_llm::executor::communicationtype::kmpi (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor17CommunicationType4kMPIE", false]], "tensorrt_llm::executor::contextchunkingpolicy (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor21ContextChunkingPolicyE", false]], "tensorrt_llm::executor::contextchunkingpolicy::kequal_progress (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor21ContextChunkingPolicy15kEQUAL_PROGRESSE", false]], "tensorrt_llm::executor::contextchunkingpolicy::kfirst_come_first_served (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor21ContextChunkingPolicy24kFIRST_COME_FIRST_SERVEDE", false]], "tensorrt_llm::executor::contextphaseparams (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParamsE", false]], "tensorrt_llm::executor::contextphaseparams::contextphaseparams (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeNSt8optionalI9VecTokensEE", false], [0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypePvNSt8optionalI9VecTokensEE", false], [0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeRKNSt6vectorIcEENSt8optionalI9VecTokensEE", false], [0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsERK18ContextPhaseParams", false], [0, 
"_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsERR18ContextPhaseParams", false]], "tensorrt_llm::executor::contextphaseparams::deleter (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams7deleterEPKv", false]], "tensorrt_llm::executor::contextphaseparams::getdrafttokens (c++ function)": [[0, "_CPPv4NKR12tensorrt_llm8executor18ContextPhaseParams14getDraftTokensEv", false]], "tensorrt_llm::executor::contextphaseparams::getfirstgentokens (c++ function)": [[0, "_CPPv4NKR12tensorrt_llm8executor18ContextPhaseParams17getFirstGenTokensEv", false]], "tensorrt_llm::executor::contextphaseparams::getreqid (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18ContextPhaseParams8getReqIdEv", false]], "tensorrt_llm::executor::contextphaseparams::getserializedstate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18ContextPhaseParams18getSerializedStateEv", false]], "tensorrt_llm::executor::contextphaseparams::getstate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams8getStateEv", false], [0, "_CPPv4NK12tensorrt_llm8executor18ContextPhaseParams8getStateEv", false]], "tensorrt_llm::executor::contextphaseparams::mdrafttokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams12mDraftTokensE", false]], "tensorrt_llm::executor::contextphaseparams::mfirstgentokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams15mFirstGenTokensE", false]], "tensorrt_llm::executor::contextphaseparams::mreqid (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams6mReqIdE", false]], "tensorrt_llm::executor::contextphaseparams::mstate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams6mStateE", false]], "tensorrt_llm::executor::contextphaseparams::operator= (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParamsaSERK18ContextPhaseParams", false], [0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParamsaSERR18ContextPhaseParams", 
false]], "tensorrt_llm::executor::contextphaseparams::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18ContextPhaseParamseqERK18ContextPhaseParams", false]], "tensorrt_llm::executor::contextphaseparams::popfirstgentokens (c++ function)": [[0, "_CPPv4NO12tensorrt_llm8executor18ContextPhaseParams17popFirstGenTokensEv", false]], "tensorrt_llm::executor::contextphaseparams::releasestate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams12releaseStateEv", false]], "tensorrt_llm::executor::contextphaseparams::requestidtype (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams13RequestIdTypeE", false]], "tensorrt_llm::executor::contextphaseparams::stateptr (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams8StatePtrE", false]], "tensorrt_llm::executor::contextphaseparams::~contextphaseparams (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParamsD0Ev", false]], "tensorrt_llm::executor::datatransceiverstate (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor20DataTransceiverStateE", false]], "tensorrt_llm::executor::datatransceiverstate::datatransceiverstate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState20DataTransceiverStateEN8kv_cache10CacheStateEN8kv_cache9CommStateE", false], [0, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState20DataTransceiverStateEv", false]], "tensorrt_llm::executor::datatransceiverstate::getcachestate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20DataTransceiverState13getCacheStateEv", false]], "tensorrt_llm::executor::datatransceiverstate::getcommstate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20DataTransceiverState12getCommStateEv", false]], "tensorrt_llm::executor::datatransceiverstate::mcachestate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState11mCacheStateE", false]], "tensorrt_llm::executor::datatransceiverstate::mcommstate (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor20DataTransceiverState10mCommStateE", false]], "tensorrt_llm::executor::datatransceiverstate::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20DataTransceiverStateeqERK20DataTransceiverState", false]], "tensorrt_llm::executor::datatransceiverstate::setcachestate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState13setCacheStateEN8kv_cache10CacheStateE", false]], "tensorrt_llm::executor::datatransceiverstate::setcommstate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState12setCommStateEN8kv_cache9CommStateE", false]], "tensorrt_llm::executor::datatransceiverstate::tostring (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20DataTransceiverState8toStringEv", false]], "tensorrt_llm::executor::datatype (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor8DataTypeE", false]], "tensorrt_llm::executor::datatype::kbf16 (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType5kBF16E", false]], "tensorrt_llm::executor::datatype::kbool (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType5kBOOLE", false]], "tensorrt_llm::executor::datatype::kfp16 (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType5kFP16E", false]], "tensorrt_llm::executor::datatype::kfp32 (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType5kFP32E", false]], "tensorrt_llm::executor::datatype::kfp8 (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType4kFP8E", false]], "tensorrt_llm::executor::datatype::kint32 (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType6kINT32E", false]], "tensorrt_llm::executor::datatype::kint64 (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType6kINT64E", false]], "tensorrt_llm::executor::datatype::kint8 (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType5kINT8E", false]], "tensorrt_llm::executor::datatype::kuint8 (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType6kUINT8E", 
false]], "tensorrt_llm::executor::datatype::kunknown (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8DataType8kUNKNOWNE", false]], "tensorrt_llm::executor::debugconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfigE", false]], "tensorrt_llm::executor::debugconfig::debugconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig11DebugConfigEbb9StringVec10SizeType32", false]], "tensorrt_llm::executor::debugconfig::getdebuginputtensors (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11DebugConfig20getDebugInputTensorsEv", false]], "tensorrt_llm::executor::debugconfig::getdebugoutputtensors (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11DebugConfig21getDebugOutputTensorsEv", false]], "tensorrt_llm::executor::debugconfig::getdebugtensornames (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11DebugConfig19getDebugTensorNamesEv", false]], "tensorrt_llm::executor::debugconfig::getdebugtensorsmaxiterations (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11DebugConfig28getDebugTensorsMaxIterationsEv", false]], "tensorrt_llm::executor::debugconfig::mdebuginputtensors (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig18mDebugInputTensorsE", false]], "tensorrt_llm::executor::debugconfig::mdebugoutputtensors (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig19mDebugOutputTensorsE", false]], "tensorrt_llm::executor::debugconfig::mdebugtensornames (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig17mDebugTensorNamesE", false]], "tensorrt_llm::executor::debugconfig::mdebugtensorsmaxiterations (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig26mDebugTensorsMaxIterationsE", false]], "tensorrt_llm::executor::debugconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11DebugConfigeqERK11DebugConfig", false]], "tensorrt_llm::executor::debugconfig::setdebuginputtensors (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor11DebugConfig20setDebugInputTensorsEb", false]], "tensorrt_llm::executor::debugconfig::setdebugoutputtensors (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig21setDebugOutputTensorsEb", false]], "tensorrt_llm::executor::debugconfig::setdebugtensornames (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig19setDebugTensorNamesERK9StringVec", false]], "tensorrt_llm::executor::debugconfig::setdebugtensorsmaxiterations (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig28setDebugTensorsMaxIterationsE10SizeType32", false]], "tensorrt_llm::executor::debugconfig::stringvec (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor11DebugConfig9StringVecE", false]], "tensorrt_llm::executor::debugtensorsperiteration (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor24DebugTensorsPerIterationE", false]], "tensorrt_llm::executor::debugtensorsperiteration::debugtensors (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor24DebugTensorsPerIteration12debugTensorsE", false]], "tensorrt_llm::executor::debugtensorsperiteration::iter (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor24DebugTensorsPerIteration4iterE", false]], "tensorrt_llm::executor::decodingconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfigE", false]], "tensorrt_llm::executor::decodingconfig::decodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14DecodingConfigENSt8optionalI12DecodingModeEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI13MedusaChoicesEENSt8optionalI11EagleConfigEE", false]], "tensorrt_llm::executor::decodingconfig::enableseamlesslookaheaddecoding (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig31enableSeamlessLookaheadDecodingEv", false]], "tensorrt_llm::executor::decodingconfig::getdecodingmode (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig15getDecodingModeEv", false]], "tensorrt_llm::executor::decodingconfig::geteagleconfig 
(c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig14getEagleConfigEv", false]], "tensorrt_llm::executor::decodingconfig::getlookaheaddecodingconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig26getLookaheadDecodingConfigEv", false]], "tensorrt_llm::executor::decodingconfig::getlookaheaddecodingmaxnumrequest (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig33getLookaheadDecodingMaxNumRequestEv", false]], "tensorrt_llm::executor::decodingconfig::getmedusachoices (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig16getMedusaChoicesEv", false]], "tensorrt_llm::executor::decodingconfig::mdecodingmode (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig13mDecodingModeE", false]], "tensorrt_llm::executor::decodingconfig::meagleconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig12mEagleConfigE", false]], "tensorrt_llm::executor::decodingconfig::mlookaheaddecodingconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig24mLookaheadDecodingConfigE", false]], "tensorrt_llm::executor::decodingconfig::mlookaheaddecodingmaxnumrequest (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig31mLookaheadDecodingMaxNumRequestE", false]], "tensorrt_llm::executor::decodingconfig::mmedusachoices (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14mMedusaChoicesE", false]], "tensorrt_llm::executor::decodingconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14DecodingConfigeqERK14DecodingConfig", false]], "tensorrt_llm::executor::decodingconfig::setdecodingmode (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig15setDecodingModeERK12DecodingMode", false]], "tensorrt_llm::executor::decodingconfig::seteagleconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14setEagleConfigERK11EagleConfig", false]], 
"tensorrt_llm::executor::decodingconfig::setlookaheaddecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig26setLookaheadDecodingConfigERK23LookaheadDecodingConfig", false]], "tensorrt_llm::executor::decodingconfig::setmedusachoices (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14DecodingConfig16setMedusaChoicesERK13MedusaChoices", false]], "tensorrt_llm::executor::decodingmode (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingModeE", false]], "tensorrt_llm::executor::decodingmode::allbitset (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode9allBitSetE14UnderlyingType", false]], "tensorrt_llm::executor::decodingmode::anybitset (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode9anyBitSetE14UnderlyingType", false]], "tensorrt_llm::executor::decodingmode::auto (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode4AutoEv", false]], "tensorrt_llm::executor::decodingmode::beamsearch (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode10BeamSearchEv", false]], "tensorrt_llm::executor::decodingmode::decodingmode (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode12DecodingModeE14UnderlyingType", false]], "tensorrt_llm::executor::decodingmode::eagle (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode5EagleEv", false]], "tensorrt_llm::executor::decodingmode::explicitdrafttokens (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode19ExplicitDraftTokensEv", false]], "tensorrt_llm::executor::decodingmode::externaldrafttokens (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode19ExternalDraftTokensEv", false]], "tensorrt_llm::executor::decodingmode::getname (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode7getNameEv", false]], "tensorrt_llm::executor::decodingmode::getstate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode8getStateEv", false]], 
"tensorrt_llm::executor::decodingmode::isauto (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode6isAutoEv", false]], "tensorrt_llm::executor::decodingmode::isbeamsearch (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode12isBeamSearchEv", false]], "tensorrt_llm::executor::decodingmode::iseagle (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode7isEagleEv", false]], "tensorrt_llm::executor::decodingmode::isexplicitdrafttokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode21isExplicitDraftTokensEv", false]], "tensorrt_llm::executor::decodingmode::isexternaldrafttokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode21isExternalDraftTokensEv", false]], "tensorrt_llm::executor::decodingmode::islookahead (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode11isLookaheadEv", false]], "tensorrt_llm::executor::decodingmode::ismedusa (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode8isMedusaEv", false]], "tensorrt_llm::executor::decodingmode::istopk (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode6isTopKEv", false]], "tensorrt_llm::executor::decodingmode::istopkandtopp (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode13isTopKandTopPEv", false]], "tensorrt_llm::executor::decodingmode::istopkortopp (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode12isTopKorTopPEv", false]], "tensorrt_llm::executor::decodingmode::istopp (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode6isTopPEv", false]], "tensorrt_llm::executor::decodingmode::isusebantokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode14isUseBanTokensEv", false]], "tensorrt_llm::executor::decodingmode::isusebanwords (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode13isUseBanWordsEv", false]], "tensorrt_llm::executor::decodingmode::isuseexpliciteosstop (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor12DecodingMode20isUseExplicitEosStopEv", false]], "tensorrt_llm::executor::decodingmode::isusefrequencypenalty (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode21isUseFrequencyPenaltyEv", false]], "tensorrt_llm::executor::decodingmode::isusemaxlengthstop (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode18isUseMaxLengthStopEv", false]], "tensorrt_llm::executor::decodingmode::isuseminlength (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode14isUseMinLengthEv", false]], "tensorrt_llm::executor::decodingmode::isuseminp (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode9isUseMinPEv", false]], "tensorrt_llm::executor::decodingmode::isusenorepeatngramsize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode22isUseNoRepeatNgramSizeEv", false]], "tensorrt_llm::executor::decodingmode::isuseoccurrencepenalty (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode22isUseOccurrencePenaltyEv", false]], "tensorrt_llm::executor::decodingmode::isusepenalty (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode12isUsePenaltyEv", false]], "tensorrt_llm::executor::decodingmode::isusepresencepenalty (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode20isUsePresencePenaltyEv", false]], "tensorrt_llm::executor::decodingmode::isuserepetitionpenalty (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode22isUseRepetitionPenaltyEv", false]], "tensorrt_llm::executor::decodingmode::isusestopcriteria (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode17isUseStopCriteriaEv", false]], "tensorrt_llm::executor::decodingmode::isusestopwords (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode14isUseStopWordsEv", false]], "tensorrt_llm::executor::decodingmode::isusetemperature (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode16isUseTemperatureEv", false]], 
"tensorrt_llm::executor::decodingmode::isusevariablebeamwidthsearch (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingMode28isUseVariableBeamWidthSearchEv", false]], "tensorrt_llm::executor::decodingmode::kauto (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode5kAutoE", false]], "tensorrt_llm::executor::decodingmode::kbeamsearch (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode11kBeamSearchE", false]], "tensorrt_llm::executor::decodingmode::keagle (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode6kEagleE", false]], "tensorrt_llm::executor::decodingmode::kexplicitdrafttokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode20kExplicitDraftTokensE", false]], "tensorrt_llm::executor::decodingmode::kexternaldrafttokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode20kExternalDraftTokensE", false]], "tensorrt_llm::executor::decodingmode::klookahead (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode10kLookaheadE", false]], "tensorrt_llm::executor::decodingmode::kmedusa (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode7kMedusaE", false]], "tensorrt_llm::executor::decodingmode::knumflags (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode9kNumFlagsE", false]], "tensorrt_llm::executor::decodingmode::ktopk (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode5kTopKE", false]], "tensorrt_llm::executor::decodingmode::ktopktopp (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode9kTopKTopPE", false]], "tensorrt_llm::executor::decodingmode::ktopp (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode5kTopPE", false]], "tensorrt_llm::executor::decodingmode::kusebantokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode13kUseBanTokensE", false]], "tensorrt_llm::executor::decodingmode::kusebanwords (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode12kUseBanWordsE", false]], 
"tensorrt_llm::executor::decodingmode::kuseexpliciteosstop (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode19kUseExplicitEosStopE", false]], "tensorrt_llm::executor::decodingmode::kusefrequencypenalties (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode22kUseFrequencyPenaltiesE", false]], "tensorrt_llm::executor::decodingmode::kusemaxlengthstop (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode17kUseMaxLengthStopE", false]], "tensorrt_llm::executor::decodingmode::kuseminlength (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode13kUseMinLengthE", false]], "tensorrt_llm::executor::decodingmode::kuseminp (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode8kUseMinPE", false]], "tensorrt_llm::executor::decodingmode::kusenorepeatngramsize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode21kUseNoRepeatNgramSizeE", false]], "tensorrt_llm::executor::decodingmode::kuseoccurrencepenalties (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode23kUseOccurrencePenaltiesE", false]], "tensorrt_llm::executor::decodingmode::kusepenalties (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode13kUsePenaltiesE", false]], "tensorrt_llm::executor::decodingmode::kusepresencepenalties (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode21kUsePresencePenaltiesE", false]], "tensorrt_llm::executor::decodingmode::kuserepetitionpenalties (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode23kUseRepetitionPenaltiesE", false]], "tensorrt_llm::executor::decodingmode::kusestandardstopcriteria (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode24kUseStandardStopCriteriaE", false]], "tensorrt_llm::executor::decodingmode::kusestopwords (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode13kUseStopWordsE", false]], "tensorrt_llm::executor::decodingmode::kusetemperature (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor12DecodingMode15kUseTemperatureE", false]], "tensorrt_llm::executor::decodingmode::kusevariablebeamwidthsearch (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode27kUseVariableBeamWidthSearchE", false]], "tensorrt_llm::executor::decodingmode::lookahead (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode9LookaheadEv", false]], "tensorrt_llm::executor::decodingmode::medusa (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode6MedusaEv", false]], "tensorrt_llm::executor::decodingmode::mstate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode6mStateE", false]], "tensorrt_llm::executor::decodingmode::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor12DecodingModeeqERK12DecodingMode", false]], "tensorrt_llm::executor::decodingmode::setbitto (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode8setBitToE14UnderlyingTypeb", false]], "tensorrt_llm::executor::decodingmode::topk (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode4TopKEv", false]], "tensorrt_llm::executor::decodingmode::topktopp (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode8TopKTopPEv", false]], "tensorrt_llm::executor::decodingmode::topp (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode4TopPEv", false]], "tensorrt_llm::executor::decodingmode::underlyingtype (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode14UnderlyingTypeE", false]], "tensorrt_llm::executor::decodingmode::usebantokens (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode12useBanTokensEb", false]], "tensorrt_llm::executor::decodingmode::usebanwords (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode11useBanWordsEb", false]], "tensorrt_llm::executor::decodingmode::useexpliciteosstop (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode18useExplicitEosStopEb", false]], 
"tensorrt_llm::executor::decodingmode::usefrequencypenalty (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode19useFrequencyPenaltyEb", false]], "tensorrt_llm::executor::decodingmode::usemaxlengthstop (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode16useMaxLengthStopEb", false]], "tensorrt_llm::executor::decodingmode::useminlength (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode12useMinLengthEb", false]], "tensorrt_llm::executor::decodingmode::useminp (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode7useMinPEb", false]], "tensorrt_llm::executor::decodingmode::usenorepeatngramsize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode20useNoRepeatNgramSizeEb", false]], "tensorrt_llm::executor::decodingmode::useoccurrencepenalties (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode22useOccurrencePenaltiesEb", false]], "tensorrt_llm::executor::decodingmode::usepresencepenalty (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode18usePresencePenaltyEb", false]], "tensorrt_llm::executor::decodingmode::userepetitionpenalty (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode20useRepetitionPenaltyEb", false]], "tensorrt_llm::executor::decodingmode::usestopwords (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode12useStopWordsEb", false]], "tensorrt_llm::executor::decodingmode::usetemperature (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode14useTemperatureEb", false]], "tensorrt_llm::executor::decodingmode::usevariablebeamwidthsearch (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12DecodingMode26useVariableBeamWidthSearchEb", false]], "tensorrt_llm::executor::detail (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor6detailE", false]], "tensorrt_llm::executor::detail::dimtype64 (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor6detail9DimType64E", false]], "tensorrt_llm::executor::detail::ofitensor (c++ 
function)": [[0, "_CPPv4N12tensorrt_llm8executor6detail9ofITensorENSt10shared_ptrIN7runtime7ITensorEEE", false]], "tensorrt_llm::executor::detail::toitensor (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor6detail9toITensorERK6Tensor", false]], "tensorrt_llm::executor::disagg_executor (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executorE", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestratorE", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::awaitcontextresponses (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator21awaitContextResponsesERKNSt8optionalINSt6chrono12millisecondsEEENSt8optionalIiEE", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::awaitgenerationresponses (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator24awaitGenerationResponsesERKNSt8optionalINSt6chrono12millisecondsEEENSt8optionalIiEE", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::canenqueue (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator10canEnqueueEv", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::disaggexecutororchestrator (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator26DisaggExecutorOrchestratorERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorIN8executor14ExecutorConfigEEERKNSt6vectorIN8executor14ExecutorConfigEEEbb", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::enqueuecontext (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator14enqueueContextERKNSt6vectorIN5texec7RequestEEENSt8optionalIiEEb", false]], 
"tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::enqueuegeneration (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator17enqueueGenerationERKNSt6vectorIN5texec7RequestEEERKNSt6vectorI6IdTypeEENSt8optionalIiEEb", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::getcontextexecutors (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator19getContextExecutorsEv", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::getgenexecutors (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator15getGenExecutorsEv", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::mimpl (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator5mImplE", false]], "tensorrt_llm::executor::disagg_executor::disaggexecutororchestrator::~disaggexecutororchestrator (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestratorD0Ev", false]], "tensorrt_llm::executor::disagg_executor::responsewithid (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdE", false]], "tensorrt_llm::executor::disagg_executor::responsewithid::gid (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId3gidE", false]], "tensorrt_llm::executor::disagg_executor::responsewithid::operator= (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdaSERK14ResponseWithId", false], [0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdaSERR14ResponseWithId", false]], "tensorrt_llm::executor::disagg_executor::responsewithid::response (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId8responseE", false]], "tensorrt_llm::executor::disagg_executor::responsewithid::responsewithid (c++ function)": 
[[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERK14ResponseWithId", false], [0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERKN12tensorrt_llm8executor8ResponseE6IdType", false], [0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERR14ResponseWithId", false], [0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERRN12tensorrt_llm8executor8ResponseE6IdType", false]], "tensorrt_llm::executor::disagg_executor::responsewithid::~responsewithid (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdD0Ev", false]], "tensorrt_llm::executor::disservingrequeststats (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor22DisServingRequestStatsE", false]], "tensorrt_llm::executor::disservingrequeststats::kvcachesize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22DisServingRequestStats11kvCacheSizeE", false]], "tensorrt_llm::executor::disservingrequeststats::kvcachetransferms (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22DisServingRequestStats17kvCacheTransferMSE", false]], "tensorrt_llm::executor::dynamicbatchconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfigE", false]], "tensorrt_llm::executor::dynamicbatchconfig::dynamicbatchconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig18DynamicBatchConfigEbb10SizeType32NSt6vectorINSt4pairI10SizeType3210SizeType32EEEE", false]], "tensorrt_llm::executor::dynamicbatchconfig::getbatchsizetable (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18DynamicBatchConfig17getBatchSizeTableEv", false]], "tensorrt_llm::executor::dynamicbatchconfig::getdynamicbatchmovingaveragewindow (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18DynamicBatchConfig34getDynamicBatchMovingAverageWindowEv", false]], "tensorrt_llm::executor::dynamicbatchconfig::getenablebatchsizetuning (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor18DynamicBatchConfig24getEnableBatchSizeTuningEv", false]], "tensorrt_llm::executor::dynamicbatchconfig::getenablemaxnumtokenstuning (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18DynamicBatchConfig27getEnableMaxNumTokensTuningEv", false]], "tensorrt_llm::executor::dynamicbatchconfig::kdefaultbatchsizetable (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig22kDefaultBatchSizeTableE", false]], "tensorrt_llm::executor::dynamicbatchconfig::kdefaultdynamicbatchmovingaveragewindow (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig39kDefaultDynamicBatchMovingAverageWindowE", false]], "tensorrt_llm::executor::dynamicbatchconfig::mbatchsizetable (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig15mBatchSizeTableE", false]], "tensorrt_llm::executor::dynamicbatchconfig::mdynamicbatchmovingaveragewindow (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig32mDynamicBatchMovingAverageWindowE", false]], "tensorrt_llm::executor::dynamicbatchconfig::menablebatchsizetuning (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig22mEnableBatchSizeTuningE", false]], "tensorrt_llm::executor::dynamicbatchconfig::menablemaxnumtokenstuning (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig25mEnableMaxNumTokensTuningE", false]], "tensorrt_llm::executor::eaglechoices (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor12EagleChoicesE", false]], "tensorrt_llm::executor::eagleconfig (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor11EagleConfigE", false]], "tensorrt_llm::executor::eagleconfig::checkposteriorvalue (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor11EagleConfig19checkPosteriorValueERKNSt8optionalIfEE", false]], "tensorrt_llm::executor::eagleconfig::eagleconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor11EagleConfig11EagleConfigENSt8optionalI12EagleChoicesEEbNSt8optionalIfEEbNSt8optionalI10SizeType32EE", 
false]], "tensorrt_llm::executor::eagleconfig::getdynamictreemaxtopk (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11EagleConfig21getDynamicTreeMaxTopKEv", false]], "tensorrt_llm::executor::eagleconfig::geteaglechoices (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11EagleConfig15getEagleChoicesEv", false]], "tensorrt_llm::executor::eagleconfig::getposteriorthreshold (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11EagleConfig21getPosteriorThresholdEv", false]], "tensorrt_llm::executor::eagleconfig::isgreedysampling (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11EagleConfig16isGreedySamplingEv", false]], "tensorrt_llm::executor::eagleconfig::mdynamictreemaxtopk (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11EagleConfig19mDynamicTreeMaxTopKE", false]], "tensorrt_llm::executor::eagleconfig::meaglechoices (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11EagleConfig13mEagleChoicesE", false]], "tensorrt_llm::executor::eagleconfig::mgreedysampling (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11EagleConfig15mGreedySamplingE", false]], "tensorrt_llm::executor::eagleconfig::mposteriorthreshold (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11EagleConfig19mPosteriorThresholdE", false]], "tensorrt_llm::executor::eagleconfig::musedynamictree (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11EagleConfig15mUseDynamicTreeE", false]], "tensorrt_llm::executor::eagleconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11EagleConfigeqERK11EagleConfig", false]], "tensorrt_llm::executor::eagleconfig::usedynamictree (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11EagleConfig14useDynamicTreeEv", false]], "tensorrt_llm::executor::executor (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8ExecutorE", false]], "tensorrt_llm::executor::executor::awaitresponses (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERK6IdTypeRKNSt8optionalINSt6chrono12millisecondsEEE", false], [0, 
"_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERKNSt6vectorI6IdTypeEERKNSt8optionalINSt6chrono12millisecondsEEE", false], [0, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERKNSt8optionalINSt6chrono12millisecondsEEE", false]], "tensorrt_llm::executor::executor::cancelrequest (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Executor13cancelRequestE6IdType", false]], "tensorrt_llm::executor::executor::canenqueuerequests (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8Executor18canEnqueueRequestsEv", false]], "tensorrt_llm::executor::executor::enqueuerequest (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Executor14enqueueRequestERK7Request", false]], "tensorrt_llm::executor::executor::enqueuerequests (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Executor15enqueueRequestsERKNSt6vectorI7RequestEE", false]], "tensorrt_llm::executor::executor::executor (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorENSt10shared_ptrI5ModelEENSt10shared_ptrI5ModelEERK14ExecutorConfig", false], [0, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorENSt10shared_ptrI5ModelEERK14ExecutorConfig", false], [0, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfigRKNSt8optionalINSt3mapINSt6stringE6TensorEEEE", false], [0, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfig", false], [0, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK8Executor", false], [0, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", false], [0, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", false], [0, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERR8Executor", false]], "tensorrt_llm::executor::executor::getkvcacheeventmanager (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor8Executor22getKVCacheEventManagerEv", false]], "tensorrt_llm::executor::executor::getlatestdebugtensors (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Executor21getLatestDebugTensorsEv", false]], "tensorrt_llm::executor::executor::getlatestiterationstats (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Executor23getLatestIterationStatsEv", false]], "tensorrt_llm::executor::executor::getlatestrequeststats (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Executor21getLatestRequestStatsEv", false]], "tensorrt_llm::executor::executor::getnumresponsesready (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8Executor20getNumResponsesReadyERKNSt8optionalI6IdTypeEE", false]], "tensorrt_llm::executor::executor::isparticipant (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8Executor13isParticipantEv", false]], "tensorrt_llm::executor::executor::mimpl (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8Executor5mImplE", false]], "tensorrt_llm::executor::executor::operator= (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8ExecutoraSERK8Executor", false], [0, "_CPPv4N12tensorrt_llm8executor8ExecutoraSERR8Executor", false]], "tensorrt_llm::executor::executor::shutdown (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Executor8shutdownEv", false]], "tensorrt_llm::executor::executor::~executor (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8ExecutorD0Ev", false]], "tensorrt_llm::executor::executorconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfigE", false]], "tensorrt_llm::executor::executorconfig::executorconfig (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", false]], "tensorrt_llm::executor::executorconfig::getadditionalmodeloutputs (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getAdditionalModelOutputsEv", false]], "tensorrt_llm::executor::executorconfig::getbatchingtype (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getBatchingTypeEv", false]], "tensorrt_llm::executor::executorconfig::getcachetransceiverconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getCacheTransceiverConfigEv", false]], "tensorrt_llm::executor::executorconfig::getdebugconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig14getDebugConfigEv", false]], "tensorrt_llm::executor::executorconfig::getdecodingconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig17getDecodingConfigEv", false]], "tensorrt_llm::executor::executorconfig::getenablechunkedcontext (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig23getEnableChunkedContextEv", false]], "tensorrt_llm::executor::executorconfig::getenabletrtoverlap (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig19getEnableTrtOverlapEv", false]], "tensorrt_llm::executor::executorconfig::getextendedruntimeperfknobconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig32getExtendedRuntimePerfKnobConfigEv", false]], 
"tensorrt_llm::executor::executorconfig::getgathergenerationlogits (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getGatherGenerationLogitsEv", false]], "tensorrt_llm::executor::executorconfig::getgpuweightspercent (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig20getGpuWeightsPercentEv", false]], "tensorrt_llm::executor::executorconfig::getguideddecodingconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig23getGuidedDecodingConfigEv", false]], "tensorrt_llm::executor::executorconfig::getiterstatsmaxiterations (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getIterStatsMaxIterationsEv", false]], "tensorrt_llm::executor::executorconfig::getkvcacheconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig16getKvCacheConfigEv", false]], "tensorrt_llm::executor::executorconfig::getkvcacheconfigref (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig19getKvCacheConfigRefEv", false]], "tensorrt_llm::executor::executorconfig::getlogitspostprocessorconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig28getLogitsPostProcessorConfigEv", false]], "tensorrt_llm::executor::executorconfig::getmaxbatchsize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getMaxBatchSizeEv", false]], "tensorrt_llm::executor::executorconfig::getmaxbeamwidth (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getMaxBeamWidthEv", false]], "tensorrt_llm::executor::executorconfig::getmaxnumtokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getMaxNumTokensEv", false]], "tensorrt_llm::executor::executorconfig::getmaxqueuesize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getMaxQueueSizeEv", false]], "tensorrt_llm::executor::executorconfig::getmaxseqidlemicroseconds (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getMaxSeqIdleMicrosecondsEv", false]], "tensorrt_llm::executor::executorconfig::getnormalizelogprobs (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig20getNormalizeLogProbsEv", false]], "tensorrt_llm::executor::executorconfig::getparallelconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig17getParallelConfigEv", false]], "tensorrt_llm::executor::executorconfig::getpeftcacheconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig18getPeftCacheConfigEv", false]], "tensorrt_llm::executor::executorconfig::getprompttableoffloading (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig24getPromptTableOffloadingEv", false]], "tensorrt_llm::executor::executorconfig::getrecvpollperiodms (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig19getRecvPollPeriodMsEv", false]], "tensorrt_llm::executor::executorconfig::getrequeststatsmaxiterations (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig28getRequestStatsMaxIterationsEv", false]], "tensorrt_llm::executor::executorconfig::getschedulerconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig18getSchedulerConfigEv", false]], "tensorrt_llm::executor::executorconfig::getschedulerconfigref (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig21getSchedulerConfigRefEv", false]], "tensorrt_llm::executor::executorconfig::getspecdecconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig16getSpecDecConfigEv", false]], "tensorrt_llm::executor::executorconfig::getusegpudirectstorage (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig22getUseGpuDirectStorageEv", false]], "tensorrt_llm::executor::executorconfig::kdefaultiterstatsmaxiterations (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig30kDefaultIterStatsMaxIterationsE", false]], 
"tensorrt_llm::executor::executorconfig::kdefaultmaxseqidlemicroseconds (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig30kDefaultMaxSeqIdleMicrosecondsE", false]], "tensorrt_llm::executor::executorconfig::kdefaultrequeststatsmaxiterations (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig33kDefaultRequestStatsMaxIterationsE", false]], "tensorrt_llm::executor::executorconfig::madditionalmodeloutputs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mAdditionalModelOutputsE", false]], "tensorrt_llm::executor::executorconfig::mbatchingtype (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mBatchingTypeE", false]], "tensorrt_llm::executor::executorconfig::mcachetransceiverconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mCacheTransceiverConfigE", false]], "tensorrt_llm::executor::executorconfig::mdebugconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig12mDebugConfigE", false]], "tensorrt_llm::executor::executorconfig::mdecodingconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15mDecodingConfigE", false]], "tensorrt_llm::executor::executorconfig::menablechunkedcontext (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig21mEnableChunkedContextE", false]], "tensorrt_llm::executor::executorconfig::menabletrtoverlap (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig17mEnableTrtOverlapE", false]], "tensorrt_llm::executor::executorconfig::mextendedruntimeperfknobconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig30mExtendedRuntimePerfKnobConfigE", false]], "tensorrt_llm::executor::executorconfig::mgathergenerationlogits (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mGatherGenerationLogitsE", false]], "tensorrt_llm::executor::executorconfig::mgpuweightspercent (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18mGpuWeightsPercentE", 
false]], "tensorrt_llm::executor::executorconfig::mguideddecodingconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig21mGuidedDecodingConfigE", false]], "tensorrt_llm::executor::executorconfig::miterstatsmaxiterations (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mIterStatsMaxIterationsE", false]], "tensorrt_llm::executor::executorconfig::mkvcacheconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14mKvCacheConfigE", false]], "tensorrt_llm::executor::executorconfig::mlogitspostprocessorconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig26mLogitsPostProcessorConfigE", false]], "tensorrt_llm::executor::executorconfig::mmaxbatchsize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mMaxBatchSizeE", false]], "tensorrt_llm::executor::executorconfig::mmaxbeamwidth (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mMaxBeamWidthE", false]], "tensorrt_llm::executor::executorconfig::mmaxnumtokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mMaxNumTokensE", false]], "tensorrt_llm::executor::executorconfig::mmaxqueuesize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mMaxQueueSizeE", false]], "tensorrt_llm::executor::executorconfig::mmaxseqidlemicroseconds (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mMaxSeqIdleMicrosecondsE", false]], "tensorrt_llm::executor::executorconfig::mnormalizelogprobs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18mNormalizeLogProbsE", false]], "tensorrt_llm::executor::executorconfig::mparallelconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15mParallelConfigE", false]], "tensorrt_llm::executor::executorconfig::mpeftcacheconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig16mPeftCacheConfigE", false]], "tensorrt_llm::executor::executorconfig::mprompttableoffloading (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig22mPromptTableOffloadingE", false]], "tensorrt_llm::executor::executorconfig::mrecvpollperiodms (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig17mRecvPollPeriodMsE", false]], "tensorrt_llm::executor::executorconfig::mrequeststatsmaxiterations (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig26mRequestStatsMaxIterationsE", false]], "tensorrt_llm::executor::executorconfig::mschedulerconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig16mSchedulerConfigE", false]], "tensorrt_llm::executor::executorconfig::mspeculativedecodingconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig26mSpeculativeDecodingConfigE", false]], "tensorrt_llm::executor::executorconfig::musegpudirectstorage (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig20mUseGpuDirectStorageE", false]], "tensorrt_llm::executor::executorconfig::setadditionalmodeloutputs (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setAdditionalModelOutputsERKNSt6vectorI21AdditionalModelOutputEE", false]], "tensorrt_llm::executor::executorconfig::setbatchingtype (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setBatchingTypeE12BatchingType", false]], "tensorrt_llm::executor::executorconfig::setcachetransceiverconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setCacheTransceiverConfigERK22CacheTransceiverConfig", false]], "tensorrt_llm::executor::executorconfig::setdebugconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14setDebugConfigERK11DebugConfig", false]], "tensorrt_llm::executor::executorconfig::setdecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig17setDecodingConfigERK14DecodingConfig", false]], "tensorrt_llm::executor::executorconfig::setenablechunkedcontext (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig23setEnableChunkedContextEb", false]], "tensorrt_llm::executor::executorconfig::setenabletrtoverlap (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig19setEnableTrtOverlapEb", false]], "tensorrt_llm::executor::executorconfig::setextendedruntimeperfknobconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig32setExtendedRuntimePerfKnobConfigERK29ExtendedRuntimePerfKnobConfig", false]], "tensorrt_llm::executor::executorconfig::setgathergenerationlogits (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setGatherGenerationLogitsEb", false]], "tensorrt_llm::executor::executorconfig::setgpuweightspercent (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig20setGpuWeightsPercentERKf", false]], "tensorrt_llm::executor::executorconfig::setguideddecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23setGuidedDecodingConfigERK20GuidedDecodingConfig", false]], "tensorrt_llm::executor::executorconfig::setiterstatsmaxiterations (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setIterStatsMaxIterationsE10SizeType32", false]], "tensorrt_llm::executor::executorconfig::setkvcacheconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig16setKvCacheConfigERK13KvCacheConfig", false]], "tensorrt_llm::executor::executorconfig::setlogitspostprocessorconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig28setLogitsPostProcessorConfigERK25LogitsPostProcessorConfig", false]], "tensorrt_llm::executor::executorconfig::setmaxbatchsize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxBatchSizeE10SizeType32", false]], "tensorrt_llm::executor::executorconfig::setmaxbeamwidth (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxBeamWidthE10SizeType32", false]], "tensorrt_llm::executor::executorconfig::setmaxnumtokens (c++ function)": 
[[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxNumTokensE10SizeType32", false]], "tensorrt_llm::executor::executorconfig::setmaxqueuesize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxQueueSizeERKNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::executorconfig::setmaxseqidlemicroseconds (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setMaxSeqIdleMicrosecondsE8uint64_t", false]], "tensorrt_llm::executor::executorconfig::setnormalizelogprobs (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig20setNormalizeLogProbsEb", false]], "tensorrt_llm::executor::executorconfig::setparallelconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig17setParallelConfigERK14ParallelConfig", false]], "tensorrt_llm::executor::executorconfig::setpeftcacheconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18setPeftCacheConfigERK15PeftCacheConfig", false]], "tensorrt_llm::executor::executorconfig::setprompttableoffloading (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig24setPromptTableOffloadingEb", false]], "tensorrt_llm::executor::executorconfig::setrecvpollperiodms (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig19setRecvPollPeriodMsERK10SizeType32", false]], "tensorrt_llm::executor::executorconfig::setrequeststatsmaxiterations (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig28setRequestStatsMaxIterationsE10SizeType32", false]], "tensorrt_llm::executor::executorconfig::setschedulerconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18setSchedulerConfigERK15SchedulerConfig", false]], "tensorrt_llm::executor::executorconfig::setspecdecconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig16setSpecDecConfigERK25SpeculativeDecodingConfig", false]], "tensorrt_llm::executor::executorconfig::setusegpudirectstorage (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig22setUseGpuDirectStorageERKb", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfigE", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::extendedruntimeperfknobconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig29ExtendedRuntimePerfKnobConfigEbbb10SizeType32", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::getcudagraphcachesize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig21getCudaGraphCacheSizeEv", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::getcudagraphmode (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig16getCudaGraphModeEv", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::getenablecontextfmhafp32acc (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig27getEnableContextFMHAFP32AccEv", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::getmultiblockmode (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig17getMultiBlockModeEv", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::mcudagraphcachesize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig19mCudaGraphCacheSizeE", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::mcudagraphmode (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig14mCudaGraphModeE", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::menablecontextfmhafp32acc (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig25mEnableContextFMHAFP32AccE", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::mmultiblockmode (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig15mMultiBlockModeE", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfigeqERK29ExtendedRuntimePerfKnobConfig", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::setcudagraphcachesize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig21setCudaGraphCacheSizeE10SizeType32", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::setcudagraphmode (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig16setCudaGraphModeEb", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::setenablecontextfmhafp32acc (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig27setEnableContextFMHAFP32AccEb", false]], "tensorrt_llm::executor::extendedruntimeperfknobconfig::setmultiblockmode (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig17setMultiBlockModeEb", false]], "tensorrt_llm::executor::externaldrafttokensconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfigE", false]], "tensorrt_llm::executor::externaldrafttokensconfig::externaldrafttokensconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig25ExternalDraftTokensConfigE9VecTokensNSt8optionalI6TensorEERKNSt8optionalI9FloatTypeEERKNSt8optionalIbEE", false]], "tensorrt_llm::executor::externaldrafttokensconfig::getacceptancethreshold (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor25ExternalDraftTokensConfig22getAcceptanceThresholdEv", false]], "tensorrt_llm::executor::externaldrafttokensconfig::getfastlogits (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor25ExternalDraftTokensConfig13getFastLogitsEv", false]], "tensorrt_llm::executor::externaldrafttokensconfig::getlogits (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor25ExternalDraftTokensConfig9getLogitsEv", false]], "tensorrt_llm::executor::externaldrafttokensconfig::gettokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor25ExternalDraftTokensConfig9getTokensEv", false]], "tensorrt_llm::executor::externaldrafttokensconfig::macceptancethreshold (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig20mAcceptanceThresholdE", false]], "tensorrt_llm::executor::externaldrafttokensconfig::mfastlogits (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig11mFastLogitsE", false]], "tensorrt_llm::executor::externaldrafttokensconfig::mlogits (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig7mLogitsE", false]], "tensorrt_llm::executor::externaldrafttokensconfig::mtokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig7mTokensE", false]], "tensorrt_llm::executor::finishreason (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor12FinishReasonE", false]], "tensorrt_llm::executor::finishreason::kcancelled (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12FinishReason10kCANCELLEDE", false]], "tensorrt_llm::executor::finishreason::kend_id (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12FinishReason7kEND_IDE", false]], "tensorrt_llm::executor::finishreason::klength (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12FinishReason7kLENGTHE", false]], "tensorrt_llm::executor::finishreason::knot_finished (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12FinishReason13kNOT_FINISHEDE", false]], "tensorrt_llm::executor::finishreason::kstop_words (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12FinishReason11kSTOP_WORDSE", false]], "tensorrt_llm::executor::finishreason::ktimed_out (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12FinishReason10kTIMED_OUTE", false]], "tensorrt_llm::executor::floattype (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor9FloatTypeE", 
false]], "tensorrt_llm::executor::guideddecodingconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfigE", false]], "tensorrt_llm::executor::guideddecodingconfig::getbackend (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig10getBackendEv", false]], "tensorrt_llm::executor::guideddecodingconfig::getencodedvocab (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig15getEncodedVocabEv", false]], "tensorrt_llm::executor::guideddecodingconfig::getstoptokenids (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig15getStopTokenIdsEv", false]], "tensorrt_llm::executor::guideddecodingconfig::gettokenizerstr (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig15getTokenizerStrEv", false]], "tensorrt_llm::executor::guideddecodingconfig::guideddecodingbackend (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig21GuidedDecodingBackendE", false]], "tensorrt_llm::executor::guideddecodingconfig::guideddecodingbackend::kxgrammar (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig21GuidedDecodingBackend9kXGRAMMARE", false]], "tensorrt_llm::executor::guideddecodingconfig::guideddecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig20GuidedDecodingConfigE21GuidedDecodingBackendNSt8optionalINSt6vectorINSt6stringEEEEENSt8optionalINSt6stringEEENSt8optionalINSt6vectorI11TokenIdTypeEEEE", false]], "tensorrt_llm::executor::guideddecodingconfig::mbackend (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig8mBackendE", false]], "tensorrt_llm::executor::guideddecodingconfig::mencodedvocab (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig13mEncodedVocabE", false]], "tensorrt_llm::executor::guideddecodingconfig::mstoptokenids (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig13mStopTokenIdsE", false]], 
"tensorrt_llm::executor::guideddecodingconfig::mtokenizerstr (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig13mTokenizerStrE", false]], "tensorrt_llm::executor::guideddecodingconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfigeqERK20GuidedDecodingConfig", false]], "tensorrt_llm::executor::guideddecodingconfig::setbackend (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig10setBackendERK21GuidedDecodingBackend", false]], "tensorrt_llm::executor::guideddecodingconfig::setencodedvocab (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig15setEncodedVocabERKNSt6vectorINSt6stringEEE", false]], "tensorrt_llm::executor::guideddecodingconfig::setstoptokenids (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig15setStopTokenIdsERKNSt6vectorI11TokenIdTypeEE", false]], "tensorrt_llm::executor::guideddecodingconfig::settokenizerstr (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig15setTokenizerStrERKNSt6stringE", false]], "tensorrt_llm::executor::guideddecodingconfig::validate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig8validateEv", false]], "tensorrt_llm::executor::guideddecodingparams (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParamsE", false]], "tensorrt_llm::executor::guideddecodingparams::getguide (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingParams8getGuideEv", false]], "tensorrt_llm::executor::guideddecodingparams::getguidetype (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingParams12getGuideTypeEv", false]], "tensorrt_llm::executor::guideddecodingparams::guideddecodingparams (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams20GuidedDecodingParamsE9GuideTypeNSt8optionalINSt6stringEEE", false]], "tensorrt_llm::executor::guideddecodingparams::guidetype (c++ enum)": [[0, 
"_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideTypeE", false]], "tensorrt_llm::executor::guideddecodingparams::guidetype::kebnf_grammar (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType13kEBNF_GRAMMARE", false]], "tensorrt_llm::executor::guideddecodingparams::guidetype::kjson (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType5kJSONE", false]], "tensorrt_llm::executor::guideddecodingparams::guidetype::kjson_schema (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType12kJSON_SCHEMAE", false]], "tensorrt_llm::executor::guideddecodingparams::guidetype::kregex (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType6kREGEXE", false]], "tensorrt_llm::executor::guideddecodingparams::guidetype::kstructural_tag (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType15kSTRUCTURAL_TAGE", false]], "tensorrt_llm::executor::guideddecodingparams::mguide (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams6mGuideE", false]], "tensorrt_llm::executor::guideddecodingparams::mguidetype (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams10mGuideTypeE", false]], "tensorrt_llm::executor::guideddecodingparams::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingParamseqERK20GuidedDecodingParams", false]], "tensorrt_llm::executor::idtype (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor6IdTypeE", false]], "tensorrt_llm::executor::inflightbatchingstats (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStatsE", false]], "tensorrt_llm::executor::inflightbatchingstats::avgnumdecodedtokensperiter (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats26avgNumDecodedTokensPerIterE", false]], "tensorrt_llm::executor::inflightbatchingstats::microbatchid (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor21InflightBatchingStats12microBatchIdE", false]], "tensorrt_llm::executor::inflightbatchingstats::numcontextrequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats18numContextRequestsE", false]], "tensorrt_llm::executor::inflightbatchingstats::numctxtokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats12numCtxTokensE", false]], "tensorrt_llm::executor::inflightbatchingstats::numgenrequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats14numGenRequestsE", false]], "tensorrt_llm::executor::inflightbatchingstats::numpausedrequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats17numPausedRequestsE", false]], "tensorrt_llm::executor::inflightbatchingstats::numscheduledrequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats20numScheduledRequestsE", false]], "tensorrt_llm::executor::iterationstats (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStatsE", false]], "tensorrt_llm::executor::iterationstats::cpumemusage (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats11cpuMemUsageE", false]], "tensorrt_llm::executor::iterationstats::crosskvcachestats (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats17crossKvCacheStatsE", false]], "tensorrt_llm::executor::iterationstats::gpumemusage (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats11gpuMemUsageE", false]], "tensorrt_llm::executor::iterationstats::inflightbatchingstats (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats21inflightBatchingStatsE", false]], "tensorrt_llm::executor::iterationstats::iter (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats4iterE", false]], "tensorrt_llm::executor::iterationstats::iterlatencyms (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats13iterLatencyMSE", false]], 
"tensorrt_llm::executor::iterationstats::kvcachestats (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats12kvCacheStatsE", false]], "tensorrt_llm::executor::iterationstats::maxbatchsizeruntime (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats19maxBatchSizeRuntimeE", false]], "tensorrt_llm::executor::iterationstats::maxbatchsizestatic (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats18maxBatchSizeStaticE", false]], "tensorrt_llm::executor::iterationstats::maxbatchsizetunerrecommended (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats28maxBatchSizeTunerRecommendedE", false]], "tensorrt_llm::executor::iterationstats::maxnumactiverequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats20maxNumActiveRequestsE", false]], "tensorrt_llm::executor::iterationstats::maxnumtokensruntime (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats19maxNumTokensRuntimeE", false]], "tensorrt_llm::executor::iterationstats::maxnumtokensstatic (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats18maxNumTokensStaticE", false]], "tensorrt_llm::executor::iterationstats::maxnumtokenstunerrecommended (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats28maxNumTokensTunerRecommendedE", false]], "tensorrt_llm::executor::iterationstats::newactiverequestsqueuelatencyms (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats31newActiveRequestsQueueLatencyMSE", false]], "tensorrt_llm::executor::iterationstats::numactiverequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats17numActiveRequestsE", false]], "tensorrt_llm::executor::iterationstats::numcompletedrequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats20numCompletedRequestsE", false]], "tensorrt_llm::executor::iterationstats::numnewactiverequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats20numNewActiveRequestsE", false]], 
"tensorrt_llm::executor::iterationstats::numqueuedrequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats17numQueuedRequestsE", false]], "tensorrt_llm::executor::iterationstats::pinnedmemusage (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats14pinnedMemUsageE", false]], "tensorrt_llm::executor::iterationstats::specdecstats (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats12specDecStatsE", false]], "tensorrt_llm::executor::iterationstats::staticbatchingstats (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats19staticBatchingStatsE", false]], "tensorrt_llm::executor::iterationstats::timestamp (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14IterationStats9timestampE", false]], "tensorrt_llm::executor::iterationtype (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor13IterationTypeE", false]], "tensorrt_llm::executor::jsonserialization (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor17JsonSerializationE", false]], "tensorrt_llm::executor::jsonserialization::tojsonstr (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK12RequestStats", false], [0, "_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK14IterationStats", false], [0, "_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK24RequestStatsPerIteration", false]], "tensorrt_llm::executor::kv_cache (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cacheE", false]], "tensorrt_llm::executor::kv_cache::agentdesc (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDescE", false]], "tensorrt_llm::executor::kv_cache::agentdesc::agentdesc (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDesc9AgentDescENSt6stringE", false]], "tensorrt_llm::executor::kv_cache::agentdesc::getbackendagentdesc (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache9AgentDesc19getBackendAgentDescEv", false]], "tensorrt_llm::executor::kv_cache::agentdesc::mbackendagentdesc 
(c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDesc17mBackendAgentDescE", false]], "tensorrt_llm::executor::kv_cache::agentstate (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10AgentStateE", false]], "tensorrt_llm::executor::kv_cache::agentstate::agentstate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10AgentStateENSt6stringENSt6stringE", false], [0, "_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10AgentStateEv", false]], "tensorrt_llm::executor::kv_cache::agentstate::magentname (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10mAgentNameE", false]], "tensorrt_llm::executor::kv_cache::agentstate::mconnectioninfo (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState15mConnectionInfoE", false]], "tensorrt_llm::executor::kv_cache::agentstate::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10AgentStateeqERK10AgentState", false]], "tensorrt_llm::executor::kv_cache::agentstate::tostring (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10AgentState8toStringEv", false]], "tensorrt_llm::executor::kv_cache::baseagentconfig (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache15BaseAgentConfigE", false]], "tensorrt_llm::executor::kv_cache::baseagentconfig::mname (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache15BaseAgentConfig5mNameE", false]], "tensorrt_llm::executor::kv_cache::baseagentconfig::useprogthread (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache15BaseAgentConfig13useProgThreadE", false]], "tensorrt_llm::executor::kv_cache::basetransferagent (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgentE", false]], "tensorrt_llm::executor::kv_cache::basetransferagent::checkremotedescs (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16checkRemoteDescsERKNSt6stringERK11MemoryDescs", false]], 
"tensorrt_llm::executor::kv_cache::basetransferagent::connectremoteagent (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent18connectRemoteAgentERKNSt6stringERK18ConnectionInfoType", false]], "tensorrt_llm::executor::kv_cache::basetransferagent::deregistermemory (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16deregisterMemoryERK13RegisterDescs", false]], "tensorrt_llm::executor::kv_cache::basetransferagent::getconnectioninfo (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17getConnectionInfoEv", false]], "tensorrt_llm::executor::kv_cache::basetransferagent::getlocalagentdesc (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17getLocalAgentDescEv", false]], "tensorrt_llm::executor::kv_cache::basetransferagent::getnotifiedsyncmessages (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent23getNotifiedSyncMessagesEv", false]], "tensorrt_llm::executor::kv_cache::basetransferagent::invalidateremoteagent (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent21invalidateRemoteAgentERKNSt6stringE", false]], "tensorrt_llm::executor::kv_cache::basetransferagent::loadremoteagent (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent15loadRemoteAgentERKNSt6stringERK9AgentDesc", false]], "tensorrt_llm::executor::kv_cache::basetransferagent::notifysyncmessage (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17notifySyncMessageERKNSt6stringERK11SyncMessage", false]], "tensorrt_llm::executor::kv_cache::basetransferagent::registermemory (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent14registerMemoryERK13RegisterDescs", false]], "tensorrt_llm::executor::kv_cache::basetransferagent::submittransferrequests (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent22submitTransferRequestsERK15TransferRequest", false]], "tensorrt_llm::executor::kv_cache::basetransferagent::~basetransferagent (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgentD0Ev", false]], "tensorrt_llm::executor::kv_cache::cachestate (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheStateE", false]], "tensorrt_llm::executor::kv_cache::cachestate::attentionconfig (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfigE", false]], "tensorrt_llm::executor::kv_cache::cachestate::attentionconfig::attentionconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfig15AttentionConfigE13AttentionTypei", false]], "tensorrt_llm::executor::kv_cache::cachestate::attentionconfig::mattentiontype (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfig14mAttentionTypeE", false]], "tensorrt_llm::executor::kv_cache::cachestate::attentionconfig::mkvfactor (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfig9mKvFactorE", false]], "tensorrt_llm::executor::kv_cache::cachestate::attentiontype (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState13AttentionTypeE", false]], "tensorrt_llm::executor::kv_cache::cachestate::attentiontype::kdefault (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState13AttentionType8kDEFAULTE", false]], "tensorrt_llm::executor::kv_cache::cachestate::attentiontype::kmla (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState13AttentionType4kMLAE", false]], "tensorrt_llm::executor::kv_cache::cachestate::cachestate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", false], [0, 
"_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE11ModelConfigRKN7runtime11WorldConfigEN8nvinfer18DataTypeE13AttentionTypei", false], [0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", false]], "tensorrt_llm::executor::kv_cache::cachestate::getattentionconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState18getAttentionConfigEv", false]], "tensorrt_llm::executor::kv_cache::cachestate::getdatatype (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState11getDataTypeEv", false]], "tensorrt_llm::executor::kv_cache::cachestate::getmodelconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState14getModelConfigEv", false]], "tensorrt_llm::executor::kv_cache::cachestate::getparallelconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState17getParallelConfigEv", false]], "tensorrt_llm::executor::kv_cache::cachestate::mattentionconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState16mAttentionConfigE", false]], "tensorrt_llm::executor::kv_cache::cachestate::mdatatype (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState9mDataTypeE", false]], "tensorrt_llm::executor::kv_cache::cachestate::mmodelconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState12mModelConfigE", false]], "tensorrt_llm::executor::kv_cache::cachestate::modelconfig (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState11ModelConfigE", false]], "tensorrt_llm::executor::kv_cache::cachestate::modelconfig::mnbkvheadsperlayer (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState11ModelConfig18mNbKvHeadsPerLayerE", false]], "tensorrt_llm::executor::kv_cache::cachestate::modelconfig::msizeperhead (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState11ModelConfig12mSizePerHeadE", false]], "tensorrt_llm::executor::kv_cache::cachestate::modelconfig::mtokensperblock (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState11ModelConfig15mTokensPerBlockE", false]], "tensorrt_llm::executor::kv_cache::cachestate::modelconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState11ModelConfigeqERK11ModelConfig", false]], "tensorrt_llm::executor::kv_cache::cachestate::mparallelconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15mParallelConfigE", false]], "tensorrt_llm::executor::kv_cache::cachestate::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheStateeqERKN8kv_cache10CacheStateE", false]], "tensorrt_llm::executor::kv_cache::cachestate::parallelconfig (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfigE", false]], "tensorrt_llm::executor::kv_cache::cachestate::parallelconfig::mdprank (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig7mDPrankE", false]], "tensorrt_llm::executor::kv_cache::cachestate::parallelconfig::mdpsize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig7mDPsizeE", false]], "tensorrt_llm::executor::kv_cache::cachestate::parallelconfig::menableattentiondp (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig18mEnableAttentionDPE", false]], "tensorrt_llm::executor::kv_cache::cachestate::parallelconfig::mpipelineparallelism (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig20mPipelineParallelismE", false]], "tensorrt_llm::executor::kv_cache::cachestate::parallelconfig::mtensorparallelism (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig18mTensorParallelismE", false]], 
"tensorrt_llm::executor::kv_cache::cachestate::parallelconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfigeqERK14ParallelConfig", false]], "tensorrt_llm::executor::kv_cache::cachestate::tostring (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState8toStringEv", false]], "tensorrt_llm::executor::kv_cache::commstate (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommStateE", false]], "tensorrt_llm::executor::kv_cache::commstate::commstate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10AgentStateEEi", false], [0, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10SizeType32EEi", false], [0, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI11SocketStateEEi", false], [0, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt8uint16_tENSt6stringE", false], [0, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateEv", false]], "tensorrt_llm::executor::kv_cache::commstate::getagentstate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState13getAgentStateEv", false]], "tensorrt_llm::executor::kv_cache::commstate::getmpistate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState11getMpiStateEv", false]], "tensorrt_llm::executor::kv_cache::commstate::getselfidx (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState10getSelfIdxEv", false]], "tensorrt_llm::executor::kv_cache::commstate::getsocketstate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState14getSocketStateEv", false]], "tensorrt_llm::executor::kv_cache::commstate::isagentstate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState12isAgentStateEv", false]], "tensorrt_llm::executor::kv_cache::commstate::ismpistate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState10isMpiStateEv", false]], 
"tensorrt_llm::executor::kv_cache::commstate::issocketstate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState13isSocketStateEv", false]], "tensorrt_llm::executor::kv_cache::commstate::mselfidx (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState8mSelfIdxE", false]], "tensorrt_llm::executor::kv_cache::commstate::mstate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState6mStateE", false]], "tensorrt_llm::executor::kv_cache::commstate::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommStateeqERK9CommState", false]], "tensorrt_llm::executor::kv_cache::commstate::tostring (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState8toStringEv", false]], "tensorrt_llm::executor::kv_cache::connection (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10ConnectionE", false]], "tensorrt_llm::executor::kv_cache::connection::isthreadsafe (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection12isThreadSafeEv", false]], "tensorrt_llm::executor::kv_cache::connection::recv (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4recvERK11DataContextPv6size_t", false]], "tensorrt_llm::executor::kv_cache::connection::send (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4sendERK11DataContextPKv6size_t", false]], "tensorrt_llm::executor::kv_cache::connection::~connection (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10ConnectionD0Ev", false]], "tensorrt_llm::executor::kv_cache::connectioninfotype (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache18ConnectionInfoTypeE", false]], "tensorrt_llm::executor::kv_cache::connectionmanager (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManagerE", false]], "tensorrt_llm::executor::kv_cache::connectionmanager::getcommstate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache17ConnectionManager12getCommStateEv", 
false]], "tensorrt_llm::executor::kv_cache::connectionmanager::getconnections (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManager14getConnectionsERK9CommState", false]], "tensorrt_llm::executor::kv_cache::connectionmanager::recvconnect (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManager11recvConnectERK11DataContextPv6size_t", false]], "tensorrt_llm::executor::kv_cache::connectionmanager::~connectionmanager (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManagerD0Ev", false]], "tensorrt_llm::executor::kv_cache::datacontext (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache11DataContextE", false]], "tensorrt_llm::executor::kv_cache::datacontext::datacontext (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache11DataContext11DataContextEi", false]], "tensorrt_llm::executor::kv_cache::datacontext::gettag (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache11DataContext6getTagEv", false]], "tensorrt_llm::executor::kv_cache::datacontext::mtag (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache11DataContext4mTagE", false]], "tensorrt_llm::executor::kv_cache::dynlibloader (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderE", false]], "tensorrt_llm::executor::kv_cache::dynlibloader::dlsym (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader5dlSymEPvPKc", false]], "tensorrt_llm::executor::kv_cache::dynlibloader::dynlibloader (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader12DynLibLoaderERK12DynLibLoader", false], [0, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader12DynLibLoaderEv", false]], "tensorrt_llm::executor::kv_cache::dynlibloader::getfunctionpointer (c++ function)": [[0, "_CPPv4I0EN12tensorrt_llm8executor8kv_cache12DynLibLoader18getFunctionPointerE9FunctionTRKNSt6stringERKNSt6stringE", false]], "tensorrt_llm::executor::kv_cache::dynlibloader::gethandle (c++ 
function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9getHandleERKNSt6stringE", false]], "tensorrt_llm::executor::kv_cache::dynlibloader::getinstance (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader11getInstanceEv", false]], "tensorrt_llm::executor::kv_cache::dynlibloader::mdllmutex (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9mDllMutexE", false]], "tensorrt_llm::executor::kv_cache::dynlibloader::mhandlers (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9mHandlersE", false]], "tensorrt_llm::executor::kv_cache::dynlibloader::operator= (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderaSERK12DynLibLoader", false]], "tensorrt_llm::executor::kv_cache::dynlibloader::~dynlibloader (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderD0Ev", false]], "tensorrt_llm::executor::kv_cache::maketransferagent (c++ function)": [[0, "_CPPv4IDpEN12tensorrt_llm8executor8kv_cache17makeTransferAgentENSt10unique_ptrI17BaseTransferAgentEERKNSt6stringEDpRR4Args", false]], "tensorrt_llm::executor::kv_cache::memorydesc (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDescE", false]], "tensorrt_llm::executor::kv_cache::memorydesc::deserialize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc11deserializeERNSt7istreamE", false]], "tensorrt_llm::executor::kv_cache::memorydesc::getaddr (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10MemoryDesc7getAddrEv", false]], "tensorrt_llm::executor::kv_cache::memorydesc::getdeviceid (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10MemoryDesc11getDeviceIdEv", false]], "tensorrt_llm::executor::kv_cache::memorydesc::getlen (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache10MemoryDesc6getLenEv", false]], "tensorrt_llm::executor::kv_cache::memorydesc::maddr (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc5mAddrE", false]], "tensorrt_llm::executor::kv_cache::memorydesc::mdeviceid (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc9mDeviceIdE", false]], "tensorrt_llm::executor::kv_cache::memorydesc::memorydesc (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescE9uintptr_t6size_t8uint32_t", false], [0, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescEPv6size_t8uint32_t", false], [0, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescERKNSt6vectorIcEE8uint32_t", false]], "tensorrt_llm::executor::kv_cache::memorydesc::mlen (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc4mLenE", false]], "tensorrt_llm::executor::kv_cache::memorydesc::serialize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc9serializeERK10MemoryDescRNSt7ostreamE", false]], "tensorrt_llm::executor::kv_cache::memorydesc::serializedsize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc14serializedSizeERK10MemoryDesc", false]], "tensorrt_llm::executor::kv_cache::memorydescs (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescsE", false]], "tensorrt_llm::executor::kv_cache::memorydescs::getdescs (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache11MemoryDescs8getDescsEv", false]], "tensorrt_llm::executor::kv_cache::memorydescs::gettype (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache11MemoryDescs7getTypeEv", false]], "tensorrt_llm::executor::kv_cache::memorydescs::mdescs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs6mDescsE", false]], "tensorrt_llm::executor::kv_cache::memorydescs::memorydescs (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs11MemoryDescsE10MemoryTypeNSt6vectorI10MemoryDescEE", false]], "tensorrt_llm::executor::kv_cache::memorydescs::mtype (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs5mTypeE", false]], "tensorrt_llm::executor::kv_cache::memorytype (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryTypeE", false]], "tensorrt_llm::executor::kv_cache::memorytype::kblk (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType4kBLKE", false]], "tensorrt_llm::executor::kv_cache::memorytype::kdram (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType5kDRAME", false]], "tensorrt_llm::executor::kv_cache::memorytype::kfile (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType5kFILEE", false]], "tensorrt_llm::executor::kv_cache::memorytype::kobj (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType4kOBJE", false]], "tensorrt_llm::executor::kv_cache::memorytype::kvram (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType5kVRAME", false]], "tensorrt_llm::executor::kv_cache::mpistate (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache8MpiStateE", false]], "tensorrt_llm::executor::kv_cache::mpistate::mranks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache8MpiState6mRanksE", false]], "tensorrt_llm::executor::kv_cache::mpistate::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache8MpiStateeqERK8MpiState", false]], "tensorrt_llm::executor::kv_cache::mpistate::tostring (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache8MpiState8toStringEv", false]], "tensorrt_llm::executor::kv_cache::registerdescs (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache13RegisterDescsE", false]], "tensorrt_llm::executor::kv_cache::socketstate (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache11SocketStateE", false]], "tensorrt_llm::executor::kv_cache::socketstate::mip (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache11SocketState3mIpE", false]], "tensorrt_llm::executor::kv_cache::socketstate::mport (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor8kv_cache11SocketState5mPortE", false]], "tensorrt_llm::executor::kv_cache::socketstate::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache11SocketStateeqERK11SocketState", false]], "tensorrt_llm::executor::kv_cache::socketstate::tostring (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache11SocketState8toStringEv", false]], "tensorrt_llm::executor::kv_cache::syncmessage (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache11SyncMessageE", false]], "tensorrt_llm::executor::kv_cache::transferdescs (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache13TransferDescsE", false]], "tensorrt_llm::executor::kv_cache::transferop (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOpE", false]], "tensorrt_llm::executor::kv_cache::transferop::kread (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOp5kREADE", false]], "tensorrt_llm::executor::kv_cache::transferop::kwrite (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOp6kWRITEE", false]], "tensorrt_llm::executor::kv_cache::transferrequest (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequestE", false]], "tensorrt_llm::executor::kv_cache::transferrequest::getdstdescs (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest11getDstDescsEv", false]], "tensorrt_llm::executor::kv_cache::transferrequest::getop (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest5getOpEv", false]], "tensorrt_llm::executor::kv_cache::transferrequest::getremotename (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest13getRemoteNameEv", false]], "tensorrt_llm::executor::kv_cache::transferrequest::getsrcdescs (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest11getSrcDescsEv", false]], "tensorrt_llm::executor::kv_cache::transferrequest::getsyncmessage (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest14getSyncMessageEv", false]], "tensorrt_llm::executor::kv_cache::transferrequest::mdstdescs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest9mDstDescsE", false]], "tensorrt_llm::executor::kv_cache::transferrequest::mop (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest3mOpE", false]], "tensorrt_llm::executor::kv_cache::transferrequest::mremotename (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest11mRemoteNameE", false]], "tensorrt_llm::executor::kv_cache::transferrequest::msrcdescs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest9mSrcDescsE", false]], "tensorrt_llm::executor::kv_cache::transferrequest::msyncmessage (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest12mSyncMessageE", false]], "tensorrt_llm::executor::kv_cache::transferrequest::transferrequest (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest15TransferRequestE10TransferOp13TransferDescs13TransferDescsRKNSt6stringENSt8optionalI11SyncMessageEE", false]], "tensorrt_llm::executor::kv_cache::transferstatus (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache14TransferStatusE", false]], "tensorrt_llm::executor::kv_cache::transferstatus::iscompleted (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache14TransferStatus11isCompletedEv", false]], "tensorrt_llm::executor::kv_cache::transferstatus::wait (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8kv_cache14TransferStatus4waitEv", false]], "tensorrt_llm::executor::kv_cache::transferstatus::~transferstatus (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8kv_cache14TransferStatusD0Ev", false]], "tensorrt_llm::executor::kvcacheconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfigE", false]], "tensorrt_llm::executor::kvcacheconfig::fillemptyfieldsfromruntimedefaults (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor13KvCacheConfig34fillEmptyFieldsFromRuntimeDefaultsEN12tensorrt_llm7runtime15RuntimeDefaultsE", false]], "tensorrt_llm::executor::kvcacheconfig::getcopyonpartialreuse (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig21getCopyOnPartialReuseEv", false]], "tensorrt_llm::executor::kvcacheconfig::getcrosskvcachefraction (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig23getCrossKvCacheFractionEv", false]], "tensorrt_llm::executor::kvcacheconfig::getenableblockreuse (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig19getEnableBlockReuseEv", false]], "tensorrt_llm::executor::kvcacheconfig::getenablepartialreuse (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig21getEnablePartialReuseEv", false]], "tensorrt_llm::executor::kvcacheconfig::geteventbuffermaxsize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig21getEventBufferMaxSizeEv", false]], "tensorrt_llm::executor::kvcacheconfig::getfreegpumemoryfraction (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig24getFreeGpuMemoryFractionEv", false]], "tensorrt_llm::executor::kvcacheconfig::gethostcachesize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig16getHostCacheSizeEv", false]], "tensorrt_llm::executor::kvcacheconfig::getmaxattentionwindowvec (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig24getMaxAttentionWindowVecEv", false]], "tensorrt_llm::executor::kvcacheconfig::getmaxtokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig12getMaxTokensEv", false]], "tensorrt_llm::executor::kvcacheconfig::getonboardblocks (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig16getOnboardBlocksEv", false]], "tensorrt_llm::executor::kvcacheconfig::getsecondaryoffloadminpriority (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig30getSecondaryOffloadMinPriorityEv", false]], 
"tensorrt_llm::executor::kvcacheconfig::getsinktokenlength (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig18getSinkTokenLengthEv", false]], "tensorrt_llm::executor::kvcacheconfig::kvcacheconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", false]], "tensorrt_llm::executor::kvcacheconfig::mcopyonpartialreuse (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig19mCopyOnPartialReuseE", false]], "tensorrt_llm::executor::kvcacheconfig::mcrosskvcachefraction (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21mCrossKvCacheFractionE", false]], "tensorrt_llm::executor::kvcacheconfig::menableblockreuse (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig17mEnableBlockReuseE", false]], "tensorrt_llm::executor::kvcacheconfig::menablepartialreuse (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig19mEnablePartialReuseE", false]], "tensorrt_llm::executor::kvcacheconfig::meventbuffermaxsize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig19mEventBufferMaxSizeE", false]], "tensorrt_llm::executor::kvcacheconfig::mfreegpumemoryfraction (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig22mFreeGpuMemoryFractionE", false]], "tensorrt_llm::executor::kvcacheconfig::mhostcachesize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig14mHostCacheSizeE", false]], "tensorrt_llm::executor::kvcacheconfig::mmaxattentionwindowvec (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig22mMaxAttentionWindowVecE", false]], "tensorrt_llm::executor::kvcacheconfig::mmaxtokens (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor13KvCacheConfig10mMaxTokensE", false]], "tensorrt_llm::executor::kvcacheconfig::monboardblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig14mOnboardBlocksE", false]], "tensorrt_llm::executor::kvcacheconfig::msecondaryoffloadminpriority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig28mSecondaryOffloadMinPriorityE", false]], "tensorrt_llm::executor::kvcacheconfig::msinktokenlength (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig16mSinkTokenLengthE", false]], "tensorrt_llm::executor::kvcacheconfig::setcopyonpartialreuse (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21setCopyOnPartialReuseEb", false]], "tensorrt_llm::executor::kvcacheconfig::setcrosskvcachefraction (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig23setCrossKvCacheFractionE9FloatType", false]], "tensorrt_llm::executor::kvcacheconfig::setenableblockreuse (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig19setEnableBlockReuseEb", false]], "tensorrt_llm::executor::kvcacheconfig::setenablepartialreuse (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21setEnablePartialReuseEb", false]], "tensorrt_llm::executor::kvcacheconfig::seteventbuffermaxsize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21setEventBufferMaxSizeE6size_t", false]], "tensorrt_llm::executor::kvcacheconfig::setfreegpumemoryfraction (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig24setFreeGpuMemoryFractionE9FloatType", false]], "tensorrt_llm::executor::kvcacheconfig::sethostcachesize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig16setHostCacheSizeE6size_t", false]], "tensorrt_llm::executor::kvcacheconfig::setmaxattentionwindowvec (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig24setMaxAttentionWindowVecENSt6vectorI10SizeType32EE", false]], "tensorrt_llm::executor::kvcacheconfig::setmaxtokens 
(c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig12setMaxTokensE10SizeType32", false]], "tensorrt_llm::executor::kvcacheconfig::setonboardblocks (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig16setOnboardBlocksEb", false]], "tensorrt_llm::executor::kvcacheconfig::setsecondaryoffloadminpriority (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig30setSecondaryOffloadMinPriorityENSt8optionalI17RetentionPriorityEE", false]], "tensorrt_llm::executor::kvcacheconfig::setsinktokenlength (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig18setSinkTokenLengthE10SizeType32", false]], "tensorrt_llm::executor::kvcachecreateddata (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheCreatedDataE", false]], "tensorrt_llm::executor::kvcachecreateddata::numblockspercachelevel (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheCreatedData22numBlocksPerCacheLevelE", false]], "tensorrt_llm::executor::kvcacheevent (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor12KVCacheEventE", false]], "tensorrt_llm::executor::kvcacheevent::data (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KVCacheEvent4dataE", false]], "tensorrt_llm::executor::kvcacheevent::eventid (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KVCacheEvent7eventIdE", false]], "tensorrt_llm::executor::kvcacheevent::kvcacheevent (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12KVCacheEvent12KVCacheEventE6IdType16KVCacheEventData", false]], "tensorrt_llm::executor::kvcacheeventdata (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor16KVCacheEventDataE", false]], "tensorrt_llm::executor::kvcacheeventdiff (c++ struct)": [[0, "_CPPv4I0EN12tensorrt_llm8executor16KVCacheEventDiffE", false]], "tensorrt_llm::executor::kvcacheeventdiff::newvalue (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor16KVCacheEventDiff8newValueE", false]], "tensorrt_llm::executor::kvcacheeventdiff::oldvalue (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor16KVCacheEventDiff8oldValueE", false]], "tensorrt_llm::executor::kvcacheeventmanager (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManagerE", false]], "tensorrt_llm::executor::kvcacheeventmanager::getlatestevents (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManager15getLatestEventsENSt8optionalINSt6chrono12millisecondsEEE", false]], "tensorrt_llm::executor::kvcacheeventmanager::kvcacheeventmanager (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManager19KVCacheEventManagerENSt10shared_ptrIN12tensorrt_llm13batch_manager16kv_cache_manager18BaseKVCacheManagerEEE", false]], "tensorrt_llm::executor::kvcacheeventmanager::kvcachemanager (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManager14kvCacheManagerE", false]], "tensorrt_llm::executor::kvcacheremoveddata (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheRemovedDataE", false]], "tensorrt_llm::executor::kvcacheremoveddata::blockhashes (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheRemovedData11blockHashesE", false]], "tensorrt_llm::executor::kvcacheretentionconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfigE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::getdecodedurationms (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig19getDecodeDurationMsEv", false]], "tensorrt_llm::executor::kvcacheretentionconfig::getdecoderetentionpriority (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig26getDecodeRetentionPriorityEv", false]], "tensorrt_llm::executor::kvcacheretentionconfig::getdirectory (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig12getDirectoryEv", false]], "tensorrt_llm::executor::kvcacheretentionconfig::getperblockretentionpriorityduration (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig36getPerBlockRetentionPriorityDurationE10SizeType3210SizeType32", false]], "tensorrt_llm::executor::kvcacheretentionconfig::gettokenrangeretentionconfigs (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig29getTokenRangeRetentionConfigsEv", false]], "tensorrt_llm::executor::kvcacheretentionconfig::gettransfermode (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig15getTransferModeEv", false]], "tensorrt_llm::executor::kvcacheretentionconfig::kdefaultretentionpriority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25kDefaultRetentionPriorityE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::kmaxretentionpriority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig21kMaxRetentionPriorityE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::kminretentionpriority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig21kMinRetentionPriorityE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::kvcacheretentionconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE19KvCacheTransferModeNSt8optionalINSt6stringEEE", false], [0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigEv", false]], "tensorrt_llm::executor::kvcacheretentionconfig::mdecodedurationms (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig17mDecodeDurationMsE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::mdecoderetentionpriority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig24mDecodeRetentionPriorityE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::mdirectory (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig10mDirectoryE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::mtokenrangeretentionconfigs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig27mTokenRangeRetentionConfigsE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::mtransfermode (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig13mTransferModeE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfigeqERK22KvCacheRetentionConfig", false]], "tensorrt_llm::executor::kvcacheretentionconfig::tokenrangeretentionconfig (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfigE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::tokenrangeretentionconfig::durationms (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig10durationMsE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::tokenrangeretentionconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfigeqERK25TokenRangeRetentionConfig", false]], "tensorrt_llm::executor::kvcacheretentionconfig::tokenrangeretentionconfig::priority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig8priorityE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::tokenrangeretentionconfig::tokenend (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig8tokenEndE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::tokenrangeretentionconfig::tokenrangeretentionconfig (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig25TokenRangeRetentionConfigE10SizeType32NSt8optionalI10SizeType32EE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE", false]], "tensorrt_llm::executor::kvcacheretentionconfig::tokenrangeretentionconfig::tokenstart (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig10tokenStartE", false]], "tensorrt_llm::executor::kvcachestats (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStatsE", false]], "tensorrt_llm::executor::kvcachestats::allocnewblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStats14allocNewBlocksE", false]], "tensorrt_llm::executor::kvcachestats::alloctotalblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStats16allocTotalBlocksE", false]], "tensorrt_llm::executor::kvcachestats::cachehitrate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStats12cacheHitRateE", false]], "tensorrt_llm::executor::kvcachestats::freenumblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStats13freeNumBlocksE", false]], "tensorrt_llm::executor::kvcachestats::maxnumblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStats12maxNumBlocksE", false]], "tensorrt_llm::executor::kvcachestats::missedblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStats12missedBlocksE", false]], "tensorrt_llm::executor::kvcachestats::reusedblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStats12reusedBlocksE", false]], "tensorrt_llm::executor::kvcachestats::tokensperblock (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStats14tokensPerBlockE", false]], "tensorrt_llm::executor::kvcachestats::usednumblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12KvCacheStats13usedNumBlocksE", false]], "tensorrt_llm::executor::kvcachestoredblockdata (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockDataE", 
false]], "tensorrt_llm::executor::kvcachestoredblockdata::blockhash (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData9blockHashE", false]], "tensorrt_llm::executor::kvcachestoredblockdata::cachelevel (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData10cacheLevelE", false]], "tensorrt_llm::executor::kvcachestoredblockdata::kvcachestoredblockdata (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData22KVCacheStoredBlockDataE6IdTypeN12tensorrt_llm7runtime15VecUniqueTokensENSt8optionalIN12tensorrt_llm7runtime14LoraTaskIdTypeEEE10SizeType3210SizeType32", false]], "tensorrt_llm::executor::kvcachestoredblockdata::loraid (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData6loraIdE", false]], "tensorrt_llm::executor::kvcachestoredblockdata::priority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData8priorityE", false]], "tensorrt_llm::executor::kvcachestoredblockdata::tokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData6tokensE", false]], "tensorrt_llm::executor::kvcachestoreddata (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor17KVCacheStoredDataE", false]], "tensorrt_llm::executor::kvcachestoreddata::blocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor17KVCacheStoredData6blocksE", false]], "tensorrt_llm::executor::kvcachestoreddata::parenthash (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor17KVCacheStoredData10parentHashE", false]], "tensorrt_llm::executor::kvcachetransfermode (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor19KvCacheTransferModeE", false]], "tensorrt_llm::executor::kvcachetransfermode::dram (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode4DRAME", false]], "tensorrt_llm::executor::kvcachetransfermode::gds (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode3GDSE", false]], "tensorrt_llm::executor::kvcachetransfermode::posix_debug_fallback 
(c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode20POSIX_DEBUG_FALLBACKE", false]], "tensorrt_llm::executor::kvcacheupdateddata (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedDataE", false]], "tensorrt_llm::executor::kvcacheupdateddata::blockhash (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData9blockHashE", false]], "tensorrt_llm::executor::kvcacheupdateddata::cachelevel (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData10cacheLevelE", false]], "tensorrt_llm::executor::kvcacheupdateddata::cachelevelupdated (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData17cacheLevelUpdatedE10SizeType3210SizeType32", false]], "tensorrt_llm::executor::kvcacheupdateddata::kvcacheupdateddata (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData18KVCacheUpdatedDataE6IdType", false]], "tensorrt_llm::executor::kvcacheupdateddata::priority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData8priorityE", false]], "tensorrt_llm::executor::kvcacheupdateddata::priorityupdated (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData15priorityUpdatedE10SizeType3210SizeType32", false]], "tensorrt_llm::executor::logitspostprocessor (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor19LogitsPostProcessorE", false]], "tensorrt_llm::executor::logitspostprocessorbatched (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor26LogitsPostProcessorBatchedE", false]], "tensorrt_llm::executor::logitspostprocessorconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfigE", false]], "tensorrt_llm::executor::logitspostprocessorconfig::getprocessorbatched (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor25LogitsPostProcessorConfig19getProcessorBatchedEv", false]], "tensorrt_llm::executor::logitspostprocessorconfig::getprocessormap (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor25LogitsPostProcessorConfig15getProcessorMapEv", false]], "tensorrt_llm::executor::logitspostprocessorconfig::getreplicate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor25LogitsPostProcessorConfig12getReplicateEv", false]], "tensorrt_llm::executor::logitspostprocessorconfig::logitspostprocessorconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig25LogitsPostProcessorConfigENSt8optionalI22LogitsPostProcessorMapEENSt8optionalI26LogitsPostProcessorBatchedEEb", false]], "tensorrt_llm::executor::logitspostprocessorconfig::mprocessorbatched (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig17mProcessorBatchedE", false]], "tensorrt_llm::executor::logitspostprocessorconfig::mprocessormap (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig13mProcessorMapE", false]], "tensorrt_llm::executor::logitspostprocessorconfig::mreplicate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig10mReplicateE", false]], "tensorrt_llm::executor::logitspostprocessorconfig::setprocessorbatched (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig19setProcessorBatchedERK26LogitsPostProcessorBatched", false]], "tensorrt_llm::executor::logitspostprocessorconfig::setprocessormap (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig15setProcessorMapERK22LogitsPostProcessorMap", false]], "tensorrt_llm::executor::logitspostprocessorconfig::setreplicate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig12setReplicateEb", false]], "tensorrt_llm::executor::logitspostprocessormap (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor22LogitsPostProcessorMapE", false]], "tensorrt_llm::executor::lookaheaddecodingconfig (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfigE", false]], 
"tensorrt_llm::executor::lookaheaddecodingconfig::calculatespeculativeresource (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig28calculateSpeculativeResourceEv", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::calculatespeculativeresourcetuple (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig33calculateSpeculativeResourceTupleE10SizeType3210SizeType3210SizeType32", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::get (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig3getEv", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::getngramsize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig12getNgramSizeEv", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::getverificationsetsize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig22getVerificationSetSizeEv", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::getwindowsize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig13getWindowSizeEv", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::isle (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig4isLEERK23LookaheadDecodingConfig", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::islegal (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig7isLegalE10SizeType3210SizeType3210SizeType32", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::kdefaultlookaheaddecodingngram (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig30kDefaultLookaheadDecodingNgramE", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::kdefaultlookaheaddecodingverificationset (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig40kDefaultLookaheadDecodingVerificationSetE", false]], 
"tensorrt_llm::executor::lookaheaddecodingconfig::kdefaultlookaheaddecodingwindow (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig31kDefaultLookaheadDecodingWindowE", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::lookaheaddecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig23LookaheadDecodingConfigE10SizeType3210SizeType3210SizeType32", false], [0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig23LookaheadDecodingConfigEv", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::mngramsize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig10mNgramSizeE", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::mverificationsetsize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig20mVerificationSetSizeE", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::mwindowsize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig11mWindowSizeE", false]], "tensorrt_llm::executor::lookaheaddecodingconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfigeqERK23LookaheadDecodingConfig", false]], "tensorrt_llm::executor::loraconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor10LoraConfigE", false]], "tensorrt_llm::executor::loraconfig::getconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor10LoraConfig9getConfigEv", false]], "tensorrt_llm::executor::loraconfig::gettaskid (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor10LoraConfig9getTaskIdEv", false]], "tensorrt_llm::executor::loraconfig::getweights (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor10LoraConfig10getWeightsEv", false]], "tensorrt_llm::executor::loraconfig::loraconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor10LoraConfig10LoraConfigE6IdTypeNSt8optionalI6TensorEENSt8optionalI6TensorEE", false]], "tensorrt_llm::executor::loraconfig::mconfig (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor10LoraConfig7mConfigE", false]], "tensorrt_llm::executor::loraconfig::mtaskid (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10LoraConfig7mTaskIdE", false]], "tensorrt_llm::executor::loraconfig::mweights (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10LoraConfig8mWeightsE", false]], "tensorrt_llm::executor::medusachoices (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor13MedusaChoicesE", false]], "tensorrt_llm::executor::memorytype (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor10MemoryTypeE", false]], "tensorrt_llm::executor::memorytype::kcpu (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor10MemoryType4kCPUE", false]], "tensorrt_llm::executor::memorytype::kcpu_pinned (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor10MemoryType11kCPU_PINNEDE", false]], "tensorrt_llm::executor::memorytype::kcpu_pinnedpool (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor10MemoryType15kCPU_PINNEDPOOLE", false]], "tensorrt_llm::executor::memorytype::kgpu (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor10MemoryType4kGPUE", false]], "tensorrt_llm::executor::memorytype::kunknown (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor10MemoryType8kUNKNOWNE", false]], "tensorrt_llm::executor::memorytype::kuvm (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor10MemoryType4kUVME", false]], "tensorrt_llm::executor::millisecondstype (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor16MillisecondsTypeE", false]], "tensorrt_llm::executor::modeltype (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor9ModelTypeE", false]], "tensorrt_llm::executor::modeltype::kdecoder_only (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor9ModelType13kDECODER_ONLYE", false]], "tensorrt_llm::executor::modeltype::kencoder_decoder (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor9ModelType16kENCODER_DECODERE", false]], "tensorrt_llm::executor::modeltype::kencoder_only (c++ enumerator)": [[0, 
"_CPPv4N12tensorrt_llm8executor9ModelType13kENCODER_ONLYE", false]], "tensorrt_llm::executor::mropeconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor11MropeConfigE", false]], "tensorrt_llm::executor::mropeconfig::getmropepositiondeltas (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11MropeConfig22getMRopePositionDeltasEv", false]], "tensorrt_llm::executor::mropeconfig::getmroperotarycossin (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor11MropeConfig20getMRopeRotaryCosSinEv", false]], "tensorrt_llm::executor::mropeconfig::mmropepositiondeltas (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11MropeConfig20mMRopePositionDeltasE", false]], "tensorrt_llm::executor::mropeconfig::mmroperotarycossin (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor11MropeConfig18mMRopeRotaryCosSinE", false]], "tensorrt_llm::executor::mropeconfig::mropeconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor11MropeConfig11MropeConfigE6Tensor10SizeType32", false]], "tensorrt_llm::executor::operator<< (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executorlsERNSt7ostreamE21ContextChunkingPolicy", false], [0, "_CPPv4N12tensorrt_llm8executorlsERNSt7ostreamE23CapacitySchedulerPolicy", false]], "tensorrt_llm::executor::orchestratorconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfigE", false]], "tensorrt_llm::executor::orchestratorconfig::getisorchestrator (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18OrchestratorConfig17getIsOrchestratorEv", false]], "tensorrt_llm::executor::orchestratorconfig::getorchleadercomm (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18OrchestratorConfig17getOrchLeaderCommEv", false]], "tensorrt_llm::executor::orchestratorconfig::getspawnprocesses (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18OrchestratorConfig17getSpawnProcessesEv", false]], "tensorrt_llm::executor::orchestratorconfig::getworkerexecutablepath (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor18OrchestratorConfig23getWorkerExecutablePathEv", false]], "tensorrt_llm::executor::orchestratorconfig::misorchestrator (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig15mIsOrchestratorE", false]], "tensorrt_llm::executor::orchestratorconfig::morchleadercomm (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig15mOrchLeaderCommE", false]], "tensorrt_llm::executor::orchestratorconfig::mspawnprocesses (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig15mSpawnProcessesE", false]], "tensorrt_llm::executor::orchestratorconfig::mworkerexecutablepath (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig21mWorkerExecutablePathE", false]], "tensorrt_llm::executor::orchestratorconfig::orchestratorconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig18OrchestratorConfigEbNSt6stringENSt10shared_ptrIN3mpi7MpiCommEEEb", false]], "tensorrt_llm::executor::orchestratorconfig::setisorchestrator (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig17setIsOrchestratorEb", false]], "tensorrt_llm::executor::orchestratorconfig::setorchleadercomm (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig17setOrchLeaderCommERKNSt10shared_ptrIN3mpi7MpiCommEEE", false]], "tensorrt_llm::executor::orchestratorconfig::setspawnprocesses (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig17setSpawnProcessesEb", false]], "tensorrt_llm::executor::orchestratorconfig::setworkerexecutablepath (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig23setWorkerExecutablePathERKNSt6stringE", false]], "tensorrt_llm::executor::outputconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor12OutputConfigE", false]], "tensorrt_llm::executor::outputconfig::additionalmodeloutputs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12OutputConfig22additionalModelOutputsE", false]], 
"tensorrt_llm::executor::outputconfig::excludeinputfromoutput (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12OutputConfig22excludeInputFromOutputE", false]], "tensorrt_llm::executor::outputconfig::outputconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor12OutputConfig12OutputConfigEbbbbbbNSt8optionalINSt6vectorI21AdditionalModelOutputEEEE", false]], "tensorrt_llm::executor::outputconfig::returncontextlogits (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12OutputConfig19returnContextLogitsE", false]], "tensorrt_llm::executor::outputconfig::returnencoderoutput (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12OutputConfig19returnEncoderOutputE", false]], "tensorrt_llm::executor::outputconfig::returngenerationlogits (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12OutputConfig22returnGenerationLogitsE", false]], "tensorrt_llm::executor::outputconfig::returnlogprobs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12OutputConfig14returnLogProbsE", false]], "tensorrt_llm::executor::outputconfig::returnperfmetrics (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12OutputConfig17returnPerfMetricsE", false]], "tensorrt_llm::executor::parallelconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfigE", false]], "tensorrt_llm::executor::parallelconfig::getcommunicationmode (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig20getCommunicationModeEv", false]], "tensorrt_llm::executor::parallelconfig::getcommunicationtype (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig20getCommunicationTypeEv", false]], "tensorrt_llm::executor::parallelconfig::getdeviceids (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig12getDeviceIdsEv", false]], "tensorrt_llm::executor::parallelconfig::getnumnodes (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig11getNumNodesEv", false]], "tensorrt_llm::executor::parallelconfig::getorchestratorconfig (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor14ParallelConfig21getOrchestratorConfigEv", false]], "tensorrt_llm::executor::parallelconfig::getparticipantids (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig17getParticipantIdsEv", false]], "tensorrt_llm::executor::parallelconfig::mcommmode (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig9mCommModeE", false]], "tensorrt_llm::executor::parallelconfig::mcommtype (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig9mCommTypeE", false]], "tensorrt_llm::executor::parallelconfig::mdeviceids (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig10mDeviceIdsE", false]], "tensorrt_llm::executor::parallelconfig::mnumnodes (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig9mNumNodesE", false]], "tensorrt_llm::executor::parallelconfig::morchestratorconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig19mOrchestratorConfigE", false]], "tensorrt_llm::executor::parallelconfig::mparticipantids (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig15mParticipantIdsE", false]], "tensorrt_llm::executor::parallelconfig::parallelconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig14ParallelConfigE17CommunicationType17CommunicationModeNSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI18OrchestratorConfigEENSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::parallelconfig::setcommunicationmode (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig20setCommunicationModeE17CommunicationMode", false]], "tensorrt_llm::executor::parallelconfig::setcommunicationtype (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig20setCommunicationTypeE17CommunicationType", false]], "tensorrt_llm::executor::parallelconfig::setdeviceids (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor14ParallelConfig12setDeviceIdsERKNSt6vectorI10SizeType32EE", false]], "tensorrt_llm::executor::parallelconfig::setnumnodes (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig11setNumNodesE10SizeType32", false]], "tensorrt_llm::executor::parallelconfig::setorchestratorconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig21setOrchestratorConfigERK18OrchestratorConfig", false]], "tensorrt_llm::executor::parallelconfig::setparticipantids (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14ParallelConfig17setParticipantIdsERKNSt6vectorI10SizeType32EE", false]], "tensorrt_llm::executor::peftcacheconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfigE", false]], "tensorrt_llm::executor::peftcacheconfig::getdevicecachepercent (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig21getDeviceCachePercentEv", false]], "tensorrt_llm::executor::peftcacheconfig::gethostcachesize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig16getHostCacheSizeEv", false]], "tensorrt_llm::executor::peftcacheconfig::getloraprefetchdir (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig18getLoraPrefetchDirEv", false]], "tensorrt_llm::executor::peftcacheconfig::getmaxadaptersize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig17getMaxAdapterSizeEv", false]], "tensorrt_llm::executor::peftcacheconfig::getmaxpagesperblockdevice (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig25getMaxPagesPerBlockDeviceEv", false]], "tensorrt_llm::executor::peftcacheconfig::getmaxpagesperblockhost (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig23getMaxPagesPerBlockHostEv", false]], "tensorrt_llm::executor::peftcacheconfig::getnumcopystreams (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig17getNumCopyStreamsEv", false]], 
"tensorrt_llm::executor::peftcacheconfig::getnumdevicemodulelayer (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig23getNumDeviceModuleLayerEv", false]], "tensorrt_llm::executor::peftcacheconfig::getnumensureworkers (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig19getNumEnsureWorkersEv", false]], "tensorrt_llm::executor::peftcacheconfig::getnumhostmodulelayer (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig21getNumHostModuleLayerEv", false]], "tensorrt_llm::executor::peftcacheconfig::getnumputworkers (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig16getNumPutWorkersEv", false]], "tensorrt_llm::executor::peftcacheconfig::getoptimaladaptersize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig21getOptimalAdapterSizeEv", false]], "tensorrt_llm::executor::peftcacheconfig::kdefaultmaxadaptersize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig22kDefaultMaxAdapterSizeE", false]], "tensorrt_llm::executor::peftcacheconfig::kdefaultmaxpagesperblockdevice (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig30kDefaultMaxPagesPerBlockDeviceE", false]], "tensorrt_llm::executor::peftcacheconfig::kdefaultmaxpagesperblockhost (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig28kDefaultMaxPagesPerBlockHostE", false]], "tensorrt_llm::executor::peftcacheconfig::kdefaultoptimaladaptersize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig26kDefaultOptimalAdapterSizeE", false]], "tensorrt_llm::executor::peftcacheconfig::mdevicecachepercent (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig19mDeviceCachePercentE", false]], "tensorrt_llm::executor::peftcacheconfig::mhostcachesize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig14mHostCacheSizeE", false]], "tensorrt_llm::executor::peftcacheconfig::mloraprefetchdir (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor15PeftCacheConfig16mLoraPrefetchDirE", false]], "tensorrt_llm::executor::peftcacheconfig::mmaxadaptersize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15mMaxAdapterSizeE", false]], "tensorrt_llm::executor::peftcacheconfig::mmaxpagesperblockdevice (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig23mMaxPagesPerBlockDeviceE", false]], "tensorrt_llm::executor::peftcacheconfig::mmaxpagesperblockhost (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig21mMaxPagesPerBlockHostE", false]], "tensorrt_llm::executor::peftcacheconfig::mnumcopystreams (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15mNumCopyStreamsE", false]], "tensorrt_llm::executor::peftcacheconfig::mnumdevicemodulelayer (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig21mNumDeviceModuleLayerE", false]], "tensorrt_llm::executor::peftcacheconfig::mnumensureworkers (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig17mNumEnsureWorkersE", false]], "tensorrt_llm::executor::peftcacheconfig::mnumhostmodulelayer (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig19mNumHostModuleLayerE", false]], "tensorrt_llm::executor::peftcacheconfig::mnumputworkers (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig14mNumPutWorkersE", false]], "tensorrt_llm::executor::peftcacheconfig::moptimaladaptersize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig19mOptimalAdapterSizeE", false]], "tensorrt_llm::executor::peftcacheconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfigeqERK15PeftCacheConfig", false]], "tensorrt_llm::executor::peftcacheconfig::peftcacheconfig (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", false]], "tensorrt_llm::executor::prioritytype (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor12PriorityTypeE", false]], "tensorrt_llm::executor::prompttuningconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfigE", false]], "tensorrt_llm::executor::prompttuningconfig::getembeddingtable (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18PromptTuningConfig17getEmbeddingTableEv", false]], "tensorrt_llm::executor::prompttuningconfig::getinputtokenextraids (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor18PromptTuningConfig21getInputTokenExtraIdsEv", false]], "tensorrt_llm::executor::prompttuningconfig::membeddingtable (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfig15mEmbeddingTableE", false]], "tensorrt_llm::executor::prompttuningconfig::minputtokenextraids (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfig19mInputTokenExtraIdsE", false]], "tensorrt_llm::executor::prompttuningconfig::prompttuningconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfig18PromptTuningConfigE6TensorNSt8optionalI16VecTokenExtraIdsEE", false]], "tensorrt_llm::executor::randomseedtype (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor14RandomSeedTypeE", false]], "tensorrt_llm::executor::request (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor7RequestE", false]], "tensorrt_llm::executor::request::getadditionaloutputnames (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request24getAdditionalOutputNamesEv", false]], "tensorrt_llm::executor::request::getallottedtimems (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request17getAllottedTimeMsEv", false]], "tensorrt_llm::executor::request::getbadwords (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor7Request11getBadWordsEv", false]], "tensorrt_llm::executor::request::getclientid (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request11getClientIdEv", false]], "tensorrt_llm::executor::request::getcontextphaseparams (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request21getContextPhaseParamsEv", false]], "tensorrt_llm::executor::request::getcrossattentionmask (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request21getCrossAttentionMaskEv", false]], "tensorrt_llm::executor::request::geteagleconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request14getEagleConfigEv", false]], "tensorrt_llm::executor::request::getembeddingbias (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request16getEmbeddingBiasEv", false]], "tensorrt_llm::executor::request::getencoderinputfeatures (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request23getEncoderInputFeaturesEv", false]], "tensorrt_llm::executor::request::getencoderinputtokenids (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request23getEncoderInputTokenIdsEv", false]], "tensorrt_llm::executor::request::getencoderoutputlength (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request22getEncoderOutputLengthEv", false]], "tensorrt_llm::executor::request::getendid (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request8getEndIdEv", false]], "tensorrt_llm::executor::request::getexternaldrafttokensconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request28getExternalDraftTokensConfigEv", false]], "tensorrt_llm::executor::request::getguideddecodingparams (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request23getGuidedDecodingParamsEv", false]], "tensorrt_llm::executor::request::getinputtokenids (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request16getInputTokenIdsEv", false]], "tensorrt_llm::executor::request::getkvcacheretentionconfig (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor7Request25getKvCacheRetentionConfigEv", false]], "tensorrt_llm::executor::request::getlanguageadapteruid (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request21getLanguageAdapterUidEv", false]], "tensorrt_llm::executor::request::getlogitspostprocessor (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request22getLogitsPostProcessorEv", false]], "tensorrt_llm::executor::request::getlogitspostprocessorname (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request26getLogitsPostProcessorNameEv", false]], "tensorrt_llm::executor::request::getlookaheadconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request18getLookaheadConfigEv", false]], "tensorrt_llm::executor::request::getloraconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request13getLoraConfigEv", false]], "tensorrt_llm::executor::request::getmaxtokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request12getMaxTokensEv", false]], "tensorrt_llm::executor::request::getmropeconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request14getMropeConfigEv", false]], "tensorrt_llm::executor::request::getmultimodalembedding (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request22getMultimodalEmbeddingEv", false]], "tensorrt_llm::executor::request::getoutputconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request15getOutputConfigEv", false]], "tensorrt_llm::executor::request::getpadid (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request8getPadIdEv", false]], "tensorrt_llm::executor::request::getpositionids (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request14getPositionIdsEv", false]], "tensorrt_llm::executor::request::getpriority (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request11getPriorityEv", false]], "tensorrt_llm::executor::request::getprompttuningconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request21getPromptTuningConfigEv", false]], 
"tensorrt_llm::executor::request::getrequesttype (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request14getRequestTypeEv", false]], "tensorrt_llm::executor::request::getreturnallgeneratedtokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request27getReturnAllGeneratedTokensEv", false]], "tensorrt_llm::executor::request::getsamplingconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request17getSamplingConfigEv", false]], "tensorrt_llm::executor::request::getskipcrossattnblocks (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request22getSkipCrossAttnBlocksEv", false]], "tensorrt_llm::executor::request::getstopwords (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request12getStopWordsEv", false]], "tensorrt_llm::executor::request::getstreaming (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor7Request12getStreamingEv", false]], "tensorrt_llm::executor::request::kbatchedpostprocessorname (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor7Request25kBatchedPostProcessorNameE", false]], "tensorrt_llm::executor::request::kdefaultpriority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor7Request16kDefaultPriorityE", false]], "tensorrt_llm::executor::request::kdynamicpostprocessornameprefix (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor7Request31kDynamicPostProcessorNamePrefixE", false]], "tensorrt_llm::executor::request::mimpl (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor7Request5mImplE", false]], "tensorrt_llm::executor::request::operator= (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7RequestaSERK7Request", false], [0, "_CPPv4N12tensorrt_llm8executor7RequestaSERR7Request", false]], "tensorrt_llm::executor::request::request (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", false], [0, "_CPPv4N12tensorrt_llm8executor7Request7RequestERK7Request", false], [0, "_CPPv4N12tensorrt_llm8executor7Request7RequestERR7Request", false]], "tensorrt_llm::executor::request::setallottedtimems (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request17setAllottedTimeMsE16MillisecondsType", false]], "tensorrt_llm::executor::request::setbadwords (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request11setBadWordsERKNSt4listI9VecTokensEE", false]], "tensorrt_llm::executor::request::setclientid (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request11setClientIdE6IdType", false]], "tensorrt_llm::executor::request::setcontextphaseparams (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request21setContextPhaseParamsE18ContextPhaseParams", false]], "tensorrt_llm::executor::request::setcrossattentionmask (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request21setCrossAttentionMaskE6Tensor", false]], "tensorrt_llm::executor::request::seteagleconfig (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor7Request14setEagleConfigERKNSt8optionalI11EagleConfigEE", false]], "tensorrt_llm::executor::request::setembeddingbias (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request16setEmbeddingBiasERK6Tensor", false]], "tensorrt_llm::executor::request::setencoderinputfeatures (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request23setEncoderInputFeaturesE6Tensor", false]], "tensorrt_llm::executor::request::setencoderinputtokenids (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request23setEncoderInputTokenIdsERK9VecTokens", false]], "tensorrt_llm::executor::request::setencoderoutputlength (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request22setEncoderOutputLengthE10SizeType32", false]], "tensorrt_llm::executor::request::setendid (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request8setEndIdE10SizeType32", false]], "tensorrt_llm::executor::request::setexternaldrafttokensconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request28setExternalDraftTokensConfigERK25ExternalDraftTokensConfig", false]], "tensorrt_llm::executor::request::setguideddecodingparams (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request23setGuidedDecodingParamsERK20GuidedDecodingParams", false]], "tensorrt_llm::executor::request::setkvcacheretentionconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request25setKvCacheRetentionConfigERK22KvCacheRetentionConfig", false]], "tensorrt_llm::executor::request::setlanguageadapteruid (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request21setLanguageAdapterUidE10SizeType32", false]], "tensorrt_llm::executor::request::setlogitspostprocessor (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request22setLogitsPostProcessorERKNSt8optionalI19LogitsPostProcessorEE", false]], "tensorrt_llm::executor::request::setlogitspostprocessorname (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request26setLogitsPostProcessorNameERKNSt6stringE", false]], 
"tensorrt_llm::executor::request::setlookaheadconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request18setLookaheadConfigERK23LookaheadDecodingConfig", false]], "tensorrt_llm::executor::request::setloraconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request13setLoraConfigERK10LoraConfig", false]], "tensorrt_llm::executor::request::setmropeconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request14setMropeConfigERK11MropeConfig", false]], "tensorrt_llm::executor::request::setmultimodalembedding (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request22setMultimodalEmbeddingERK6Tensor", false]], "tensorrt_llm::executor::request::setoutputconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request15setOutputConfigERK12OutputConfig", false]], "tensorrt_llm::executor::request::setpadid (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request8setPadIdE10SizeType32", false]], "tensorrt_llm::executor::request::setpositionids (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request14setPositionIdsERKNSt6vectorI10SizeType32EE", false]], "tensorrt_llm::executor::request::setpriority (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request11setPriorityE12PriorityType", false]], "tensorrt_llm::executor::request::setprompttuningconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request21setPromptTuningConfigERK18PromptTuningConfig", false]], "tensorrt_llm::executor::request::setrequesttype (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request14setRequestTypeERK11RequestType", false]], "tensorrt_llm::executor::request::setreturnallgeneratedtokens (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request27setReturnAllGeneratedTokensEb", false]], "tensorrt_llm::executor::request::setsamplingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request17setSamplingConfigERK14SamplingConfig", false]], "tensorrt_llm::executor::request::setskipcrossattnblocks (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor7Request22setSkipCrossAttnBlocksE6Tensor", false]], "tensorrt_llm::executor::request::setstopwords (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request12setStopWordsERKNSt4listI9VecTokensEE", false]], "tensorrt_llm::executor::request::setstreaming (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7Request12setStreamingEb", false]], "tensorrt_llm::executor::request::~request (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7RequestD0Ev", false]], "tensorrt_llm::executor::requestperfmetrics (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetricsE", false]], "tensorrt_llm::executor::requestperfmetrics::firstiter (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics9firstIterE", false]], "tensorrt_llm::executor::requestperfmetrics::iter (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics4iterE", false]], "tensorrt_llm::executor::requestperfmetrics::kvcachemetrics (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14kvCacheMetricsE", false]], "tensorrt_llm::executor::requestperfmetrics::kvcachemetrics (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetricsE", false]], "tensorrt_llm::executor::requestperfmetrics::kvcachemetrics::kvcachehitrate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics14kvCacheHitRateE", false]], "tensorrt_llm::executor::requestperfmetrics::kvcachemetrics::nummissedblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics15numMissedBlocksE", false]], "tensorrt_llm::executor::requestperfmetrics::kvcachemetrics::numnewallocatedblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics21numNewAllocatedBlocksE", false]], "tensorrt_llm::executor::requestperfmetrics::kvcachemetrics::numreusedblocks (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics15numReusedBlocksE", false]], "tensorrt_llm::executor::requestperfmetrics::kvcachemetrics::numtotalallocatedblocks (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics23numTotalAllocatedBlocksE", false]], "tensorrt_llm::executor::requestperfmetrics::lastiter (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics8lastIterE", false]], "tensorrt_llm::executor::requestperfmetrics::speculativedecoding (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics19speculativeDecodingE", false]], "tensorrt_llm::executor::requestperfmetrics::speculativedecodingmetrics (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics26SpeculativeDecodingMetricsE", false]], "tensorrt_llm::executor::requestperfmetrics::speculativedecodingmetrics::acceptancerate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics26SpeculativeDecodingMetrics14acceptanceRateE", false]], "tensorrt_llm::executor::requestperfmetrics::speculativedecodingmetrics::totalaccepteddrafttokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics26SpeculativeDecodingMetrics24totalAcceptedDraftTokensE", false]], "tensorrt_llm::executor::requestperfmetrics::speculativedecodingmetrics::totaldrafttokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics26SpeculativeDecodingMetrics16totalDraftTokensE", false]], "tensorrt_llm::executor::requestperfmetrics::timepoint (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics9TimePointE", false]], "tensorrt_llm::executor::requestperfmetrics::timingmetrics (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13timingMetricsE", false]], "tensorrt_llm::executor::requestperfmetrics::timingmetrics (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetricsE", false]], 
"tensorrt_llm::executor::requestperfmetrics::timingmetrics::arrivaltime (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics11arrivalTimeE", false]], "tensorrt_llm::executor::requestperfmetrics::timingmetrics::firstscheduledtime (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics18firstScheduledTimeE", false]], "tensorrt_llm::executor::requestperfmetrics::timingmetrics::firsttokentime (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics14firstTokenTimeE", false]], "tensorrt_llm::executor::requestperfmetrics::timingmetrics::kvcachesize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics11kvCacheSizeE", false]], "tensorrt_llm::executor::requestperfmetrics::timingmetrics::kvcachetransferend (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics18kvCacheTransferEndE", false]], "tensorrt_llm::executor::requestperfmetrics::timingmetrics::kvcachetransferstart (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics20kvCacheTransferStartE", false]], "tensorrt_llm::executor::requestperfmetrics::timingmetrics::lasttokentime (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics13lastTokenTimeE", false]], "tensorrt_llm::executor::requeststage (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStageE", false]], "tensorrt_llm::executor::requeststage::kcontext_in_progress (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStage20kCONTEXT_IN_PROGRESSE", false]], "tensorrt_llm::executor::requeststage::kencoder_in_progress (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStage20kENCODER_IN_PROGRESSE", false]], "tensorrt_llm::executor::requeststage::kgeneration_complete (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStage20kGENERATION_COMPLETEE", false]], 
"tensorrt_llm::executor::requeststage::kgeneration_in_progress (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStage23kGENERATION_IN_PROGRESSE", false]], "tensorrt_llm::executor::requeststage::kqueued (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStage7kQUEUEDE", false]], "tensorrt_llm::executor::requeststats (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStatsE", false]], "tensorrt_llm::executor::requeststats::allocnewblocksperrequest (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats24allocNewBlocksPerRequestE", false]], "tensorrt_llm::executor::requeststats::alloctotalblocksperrequest (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats26allocTotalBlocksPerRequestE", false]], "tensorrt_llm::executor::requeststats::avgnumdecodedtokensperiter (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats26avgNumDecodedTokensPerIterE", false]], "tensorrt_llm::executor::requeststats::contextprefillposition (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats22contextPrefillPositionE", false]], "tensorrt_llm::executor::requeststats::disservingstats (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats15disServingStatsE", false]], "tensorrt_llm::executor::requeststats::id (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats2idE", false]], "tensorrt_llm::executor::requeststats::kvcachehitrateperrequest (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats24kvCacheHitRatePerRequestE", false]], "tensorrt_llm::executor::requeststats::missedblocksperrequest (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats22missedBlocksPerRequestE", false]], "tensorrt_llm::executor::requeststats::numgeneratedtokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats18numGeneratedTokensE", false]], "tensorrt_llm::executor::requeststats::paused (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats6pausedE", false]], 
"tensorrt_llm::executor::requeststats::reusedblocksperrequest (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats22reusedBlocksPerRequestE", false]], "tensorrt_llm::executor::requeststats::scheduled (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats9scheduledE", false]], "tensorrt_llm::executor::requeststats::stage (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor12RequestStats5stageE", false]], "tensorrt_llm::executor::requeststatsperiteration (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor24RequestStatsPerIterationE", false]], "tensorrt_llm::executor::requeststatsperiteration::iter (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor24RequestStatsPerIteration4iterE", false]], "tensorrt_llm::executor::requeststatsperiteration::requeststats (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor24RequestStatsPerIteration12requestStatsE", false]], "tensorrt_llm::executor::requesttype (c++ enum)": [[0, "_CPPv4N12tensorrt_llm8executor11RequestTypeE", false]], "tensorrt_llm::executor::requesttype::request_type_context_and_generation (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor11RequestType35REQUEST_TYPE_CONTEXT_AND_GENERATIONE", false]], "tensorrt_llm::executor::requesttype::request_type_context_only (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor11RequestType25REQUEST_TYPE_CONTEXT_ONLYE", false]], "tensorrt_llm::executor::requesttype::request_type_generation_only (c++ enumerator)": [[0, "_CPPv4N12tensorrt_llm8executor11RequestType28REQUEST_TYPE_GENERATION_ONLYE", false]], "tensorrt_llm::executor::response (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor8ResponseE", false]], "tensorrt_llm::executor::response::getclientid (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8Response11getClientIdEv", false]], "tensorrt_llm::executor::response::geterrormsg (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8Response11getErrorMsgEv", false]], "tensorrt_llm::executor::response::getrequestid (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor8Response12getRequestIdEv", false]], "tensorrt_llm::executor::response::getresult (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8Response9getResultEv", false]], "tensorrt_llm::executor::response::haserror (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor8Response8hasErrorEv", false]], "tensorrt_llm::executor::response::mimpl (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor8Response5mImplE", false]], "tensorrt_llm::executor::response::operator= (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8ResponseaSERK8Response", false], [0, "_CPPv4N12tensorrt_llm8executor8ResponseaSERR8Response", false]], "tensorrt_llm::executor::response::response (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdType6ResultNSt8optionalI6IdTypeEE", false], [0, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdTypeNSt6stringENSt8optionalI6IdTypeEE", false], [0, "_CPPv4N12tensorrt_llm8executor8Response8ResponseERK8Response", false], [0, "_CPPv4N12tensorrt_llm8executor8Response8ResponseERR8Response", false]], "tensorrt_llm::executor::response::~response (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor8ResponseD0Ev", false]], "tensorrt_llm::executor::result (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor6ResultE", false]], "tensorrt_llm::executor::result::additionaloutputs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result17additionalOutputsE", false]], "tensorrt_llm::executor::result::contextlogits (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result13contextLogitsE", false]], "tensorrt_llm::executor::result::contextphaseparams (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result18contextPhaseParamsE", false]], "tensorrt_llm::executor::result::cumlogprobs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result11cumLogProbsE", false]], "tensorrt_llm::executor::result::decodingiter (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result12decodingIterE", false]], 
"tensorrt_llm::executor::result::encoderoutput (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result13encoderOutputE", false]], "tensorrt_llm::executor::result::finishreasons (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result13finishReasonsE", false]], "tensorrt_llm::executor::result::generationlogits (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result16generationLogitsE", false]], "tensorrt_llm::executor::result::isfinal (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result7isFinalE", false]], "tensorrt_llm::executor::result::issequencefinal (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result15isSequenceFinalE", false]], "tensorrt_llm::executor::result::logprobs (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result8logProbsE", false]], "tensorrt_llm::executor::result::outputtokenids (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result14outputTokenIdsE", false]], "tensorrt_llm::executor::result::requestperfmetrics (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result18requestPerfMetricsE", false]], "tensorrt_llm::executor::result::sequenceindex (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result13sequenceIndexE", false]], "tensorrt_llm::executor::result::specdecfastlogitsinfo (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Result21specDecFastLogitsInfoE", false]], "tensorrt_llm::executor::retentionpriority (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor17RetentionPriorityE", false]], "tensorrt_llm::executor::retentionpriorityandduration (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDurationE", false]], "tensorrt_llm::executor::retentionpriorityandduration::durationms (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDuration10durationMsE", false]], "tensorrt_llm::executor::retentionpriorityandduration::retentionpriority (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDuration17retentionPriorityE", false]], 
"tensorrt_llm::executor::retentionpriorityandduration::retentionpriorityandduration (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDuration28RetentionPriorityAndDurationERKNSt8optionalI17RetentionPriorityEERKNSt8optionalINSt6chrono12millisecondsEEE", false]], "tensorrt_llm::executor::samplingconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfigE", false]], "tensorrt_llm::executor::samplingconfig::checkbeamsearchdiversityrate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig28checkBeamSearchDiversityRateERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::checkbeamwidth (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14checkBeamWidthE10SizeType32", false]], "tensorrt_llm::executor::samplingconfig::checkbeamwidtharray (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig19checkBeamWidthArrayERKNSt8optionalINSt6vectorI10SizeType32EEEEK10SizeType32", false]], "tensorrt_llm::executor::samplingconfig::checkearlystopping (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18checkEarlyStoppingERKNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::samplingconfig::checklengthpenalty (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18checkLengthPenaltyERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::checkminp (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig9checkMinPERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::checkmintokens (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14checkMinTokensERKNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::samplingconfig::checknorepeatngramsize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig22checkNoRepeatNgramSizeERKNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::samplingconfig::checknumreturnsequences 
(c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig23checkNumReturnSequencesERKNSt8optionalI10SizeType32EE10SizeType32", false]], "tensorrt_llm::executor::samplingconfig::checkrepetitionpenalty (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig22checkRepetitionPenaltyERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::checktemperature (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16checkTemperatureERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::checktopk (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig9checkTopKERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::checktopp (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig9checkTopPERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::checktoppdecay (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14checkTopPDecayERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::checktoppmin (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12checkTopPMinERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::checktoppresetids (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig17checkTopPResetIdsERKNSt8optionalI11TokenIdTypeEE", false]], "tensorrt_llm::executor::samplingconfig::getbeamsearchdiversityrate (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig26getBeamSearchDiversityRateEv", false]], "tensorrt_llm::executor::samplingconfig::getbeamwidth (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig12getBeamWidthEv", false]], "tensorrt_llm::executor::samplingconfig::getbeamwidtharray (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig17getBeamWidthArrayEv", false]], "tensorrt_llm::executor::samplingconfig::getearlystopping (c++ function)": [[0, 
"_CPPv4NK12tensorrt_llm8executor14SamplingConfig16getEarlyStoppingEv", false]], "tensorrt_llm::executor::samplingconfig::getfrequencypenalty (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig19getFrequencyPenaltyEv", false]], "tensorrt_llm::executor::samplingconfig::getlengthpenalty (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig16getLengthPenaltyEv", false]], "tensorrt_llm::executor::samplingconfig::getminp (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig7getMinPEv", false]], "tensorrt_llm::executor::samplingconfig::getmintokens (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig12getMinTokensEv", false]], "tensorrt_llm::executor::samplingconfig::getnorepeatngramsize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig20getNoRepeatNgramSizeEv", false]], "tensorrt_llm::executor::samplingconfig::getnumreturnbeams (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig17getNumReturnBeamsEv", false]], "tensorrt_llm::executor::samplingconfig::getnumreturnsequences (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig21getNumReturnSequencesEv", false]], "tensorrt_llm::executor::samplingconfig::getpresencepenalty (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig18getPresencePenaltyEv", false]], "tensorrt_llm::executor::samplingconfig::getrepetitionpenalty (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig20getRepetitionPenaltyEv", false]], "tensorrt_llm::executor::samplingconfig::getseed (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig7getSeedEv", false]], "tensorrt_llm::executor::samplingconfig::gettemperature (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig14getTemperatureEv", false]], "tensorrt_llm::executor::samplingconfig::gettopk (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig7getTopKEv", false]], 
"tensorrt_llm::executor::samplingconfig::gettopp (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig7getTopPEv", false]], "tensorrt_llm::executor::samplingconfig::gettoppdecay (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig12getTopPDecayEv", false]], "tensorrt_llm::executor::samplingconfig::gettoppmin (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig10getTopPMinEv", false]], "tensorrt_llm::executor::samplingconfig::gettoppresetids (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig15getTopPResetIdsEv", false]], "tensorrt_llm::executor::samplingconfig::mbeamsearchdiversityrate (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig24mBeamSearchDiversityRateE", false]], "tensorrt_llm::executor::samplingconfig::mbeamwidth (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig10mBeamWidthE", false]], "tensorrt_llm::executor::samplingconfig::mbeamwidtharray (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig15mBeamWidthArrayE", false]], "tensorrt_llm::executor::samplingconfig::mearlystopping (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14mEarlyStoppingE", false]], "tensorrt_llm::executor::samplingconfig::mfrequencypenalty (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig17mFrequencyPenaltyE", false]], "tensorrt_llm::executor::samplingconfig::mlengthpenalty (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14mLengthPenaltyE", false]], "tensorrt_llm::executor::samplingconfig::mminp (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig5mMinPE", false]], "tensorrt_llm::executor::samplingconfig::mmintokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig10mMinTokensE", false]], "tensorrt_llm::executor::samplingconfig::mnorepeatngramsize (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18mNoRepeatNgramSizeE", false]], 
"tensorrt_llm::executor::samplingconfig::mnumreturnbeams (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig15mNumReturnBeamsE", false]], "tensorrt_llm::executor::samplingconfig::mnumreturnsequences (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig19mNumReturnSequencesE", false]], "tensorrt_llm::executor::samplingconfig::mpresencepenalty (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16mPresencePenaltyE", false]], "tensorrt_llm::executor::samplingconfig::mrepetitionpenalty (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18mRepetitionPenaltyE", false]], "tensorrt_llm::executor::samplingconfig::mseed (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig5mSeedE", false]], "tensorrt_llm::executor::samplingconfig::mtemperature (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12mTemperatureE", false]], "tensorrt_llm::executor::samplingconfig::mtopk (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig5mTopKE", false]], "tensorrt_llm::executor::samplingconfig::mtopp (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig5mTopPE", false]], "tensorrt_llm::executor::samplingconfig::mtoppdecay (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig10mTopPDecayE", false]], "tensorrt_llm::executor::samplingconfig::mtoppmin (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig8mTopPMinE", false]], "tensorrt_llm::executor::samplingconfig::mtoppresetids (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig13mTopPResetIdsE", false]], "tensorrt_llm::executor::samplingconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor14SamplingConfigeqERK14SamplingConfig", false]], "tensorrt_llm::executor::samplingconfig::samplingconfig (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", false]], "tensorrt_llm::executor::samplingconfig::setbeamsearchdiversityrate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig26setBeamSearchDiversityRateERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::setbeamwidth (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12setBeamWidthE10SizeType32", false]], "tensorrt_llm::executor::samplingconfig::setbeamwidtharray (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig17setBeamWidthArrayERKNSt8optionalINSt6vectorI10SizeType32EEEE", false]], "tensorrt_llm::executor::samplingconfig::setearlystopping (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16setEarlyStoppingERKNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::samplingconfig::setfrequencypenalty (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig19setFrequencyPenaltyERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::setlengthpenalty (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16setLengthPenaltyERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::setminp (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setMinPERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::setmintokens (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig12setMinTokensERKNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::samplingconfig::setnorepeatngramsize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig20setNoRepeatNgramSizeERKNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::samplingconfig::setnumreturnsequences (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig21setNumReturnSequencesERKNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::samplingconfig::setpresencepenalty (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18setPresencePenaltyERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::setrepetitionpenalty (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig20setRepetitionPenaltyERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::setseed (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setSeedERKNSt8optionalI14RandomSeedTypeEE", false]], "tensorrt_llm::executor::samplingconfig::settemperature (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14setTemperatureERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::settopk (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setTopKERKNSt8optionalI10SizeType32EE", false]], "tensorrt_llm::executor::samplingconfig::settopp (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setTopPERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::settoppdecay (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12setTopPDecayERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::settoppmin (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig10setTopPMinERKNSt8optionalI9FloatTypeEE", false]], "tensorrt_llm::executor::samplingconfig::settoppresetids (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig15setTopPResetIdsERKNSt8optionalI11TokenIdTypeEE", false]], "tensorrt_llm::executor::samplingconfig::updatenumreturnbeams (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor14SamplingConfig20updateNumReturnBeamsEv", false]], "tensorrt_llm::executor::schedulerconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor15SchedulerConfigE", false]], "tensorrt_llm::executor::schedulerconfig::getcapacityschedulerpolicy (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15SchedulerConfig26getCapacitySchedulerPolicyEv", false]], "tensorrt_llm::executor::schedulerconfig::getcontextchunkingpolicy (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15SchedulerConfig24getContextChunkingPolicyEv", false]], "tensorrt_llm::executor::schedulerconfig::getdynamicbatchconfig (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15SchedulerConfig21getDynamicBatchConfigEv", false]], "tensorrt_llm::executor::schedulerconfig::mcapacityschedulerpolicy (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig24mCapacitySchedulerPolicyE", false]], "tensorrt_llm::executor::schedulerconfig::mcontextchunkingpolicy (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig22mContextChunkingPolicyE", false]], "tensorrt_llm::executor::schedulerconfig::mdynamicbatchconfig (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig19mDynamicBatchConfigE", false]], "tensorrt_llm::executor::schedulerconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor15SchedulerConfigeqERK15SchedulerConfig", false]], "tensorrt_llm::executor::schedulerconfig::schedulerconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig15SchedulerConfigE23CapacitySchedulerPolicyNSt8optionalI21ContextChunkingPolicyEENSt8optionalI18DynamicBatchConfigEE", false]], "tensorrt_llm::executor::serialization (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor13SerializationE", false]], 
"tensorrt_llm::executor::serialization::deserializeadditionalmodeloutput (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization32deserializeAdditionalModelOutputERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeadditionaloutput (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization27deserializeAdditionalOutputERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeagentstate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization21deserializeAgentStateERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializebool (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization15deserializeBoolERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializecachestate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization21deserializeCacheStateERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializecachetransceiverconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization33deserializeCacheTransceiverConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializecommstate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeCommStateERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializecontextphaseparams (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeContextPhaseParamsERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializedatatransceiverstate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeDataTransceiverStateERNSt6vectorIcEE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeDataTransceiverStateERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializedebugconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeDebugConfigERNSt7istreamE", false]], 
"tensorrt_llm::executor::serialization::deserializedecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeDecodingConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializedecodingmode (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeDecodingModeERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializedisservingrequeststats (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization33deserializeDisServingRequestStatsERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializedynamicbatchconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeDynamicBatchConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeeagleconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeEagleConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeexecutorconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeExecutorConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeextendedruntimeperfknobconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization40deserializeExtendedRuntimePerfKnobConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeexternaldrafttokensconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization36deserializeExternalDraftTokensConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeguideddecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeGuidedDecodingConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeguideddecodingparams (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeGuidedDecodingParamsERNSt7istreamE", false]], 
"tensorrt_llm::executor::serialization::deserializeinflightbatchingstats (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization32deserializeInflightBatchingStatsERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeiterationstats (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeIterationStatsERNSt6vectorIcEE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeIterationStatsERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeiterationstatsvec (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization28deserializeIterationStatsVecERNSt6vectorIcEE", false]], "tensorrt_llm::executor::serialization::deserializekvcacheconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization24deserializeKvCacheConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializekvcacheretentionconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization33deserializeKvCacheRetentionConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializekvcachestats (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeKvCacheStatsERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializelookaheaddecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization34deserializeLookaheadDecodingConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeloraconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization21deserializeLoraConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializemodeltype (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeModelTypeERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializemropeconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeMropeConfigERNSt7istreamE", false]], 
"tensorrt_llm::executor::serialization::deserializeorchestratorconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeOrchestratorConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeoutputconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeOutputConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeparallelconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeParallelConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializepeftcacheconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization26deserializePeftCacheConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeprompttuningconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializePromptTuningConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializerequest (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization18deserializeRequestERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializerequestperfmetrics (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeRequestPerfMetricsERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializerequeststage (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeRequestStageERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializerequeststats (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeRequestStatsERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializerequeststatsperiteration (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization35deserializeRequestStatsPerIterationERNSt6vectorIcEE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization35deserializeRequestStatsPerIterationERNSt7istreamE", 
false]], "tensorrt_llm::executor::serialization::deserializerequeststatsperiterationvec (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization38deserializeRequestStatsPerIterationVecERNSt6vectorIcEE", false]], "tensorrt_llm::executor::serialization::deserializeresponse (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization19deserializeResponseERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeresponses (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeResponsesERNSt6vectorIcEE", false]], "tensorrt_llm::executor::serialization::deserializeresult (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization17deserializeResultERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializesamplingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeSamplingConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializeschedulerconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization26deserializeSchedulerConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializesocketstate (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeSocketStateERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializespecdecfastlogitsinfo (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization32deserializeSpecDecFastLogitsInfoERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializespecdecodingstats (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization28deserializeSpecDecodingStatsERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializespeculativedecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization36deserializeSpeculativeDecodingConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializestaticbatchingstats (c++ function)": 
[[0, "_CPPv4N12tensorrt_llm8executor13Serialization30deserializeStaticBatchingStatsERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializestring (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization17deserializeStringERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializetensor (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization17deserializeTensorERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializetimepoint (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeTimePointERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::deserializetokenrangeretentionconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization36deserializeTokenRangeRetentionConfigERNSt7istreamE", false]], "tensorrt_llm::executor::serialization::serialize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK10LoraConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11DebugConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11EagleConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11MropeConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12DecodingModeRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12KvCacheStatsRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12OutputConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStageRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStatsRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK13KvCacheConfigRNSt7ostreamE", false], [0, 
"_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14DecodingConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ExecutorConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStats", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStatsRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ParallelConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14SamplingConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15PeftCacheConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15SchedulerConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK16AdditionalOutputRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK17SpecDecodingStatsRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18ContextPhaseParamsRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18DynamicBatchConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18OrchestratorConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18PromptTuningConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18RequestPerfMetricsRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK19StaticBatchingStatsRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20DataTransceiverState", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20DataTransceiverStateRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingConfigRNSt7ostreamE", false], [0, 
"_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingParamsRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21AdditionalModelOutputRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21InflightBatchingStatsRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22CacheTransceiverConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22DisServingRequestStatsRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22KvCacheRetentionConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK23LookaheadDecodingConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK24RequestStatsPerIteration", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK24RequestStatsPerIterationRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25ExternalDraftTokensConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25SpeculativeDecodingConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK29ExtendedRuntimePerfKnobConfigRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK33SpeculativeDecodingFastLogitsInfoRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6ResultRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6TensorRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK7RequestRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK8ResponseRNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN18RequestPerfMetrics9TimePointERNSt7ostreamE", false], [0, 
"_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN22KvCacheRetentionConfig25TokenRangeRetentionConfigERNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10AgentStateERNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10CacheStateERNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache11SocketStateERNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache9CommStateERNSt7ostreamE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI14IterationStatsEE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI24RequestStatsPerIterationEE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI8ResponseEE", false]], "tensorrt_llm::executor::serialization::serializedsize (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK10LoraConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11DebugConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11EagleConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11MropeConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12DecodingMode", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12KvCacheStats", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12OutputConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12RequestStage", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12RequestStats", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK13KvCacheConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14DecodingConfig", false], [0, 
"_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14ExecutorConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14IterationStats", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14ParallelConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14SamplingConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK15PeftCacheConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK15SchedulerConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK16AdditionalOutput", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK17SpecDecodingStats", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18ContextPhaseParams", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18DynamicBatchConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18OrchestratorConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18PromptTuningConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18RequestPerfMetrics", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK19StaticBatchingStats", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20DataTransceiverState", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20GuidedDecodingConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20GuidedDecodingParams", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK21AdditionalModelOutput", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK21InflightBatchingStats", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22CacheTransceiverConfig", false], [0, 
"_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22DisServingRequestStats", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22KvCacheRetentionConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK23LookaheadDecodingConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK24RequestStatsPerIteration", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK25ExternalDraftTokensConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK25SpeculativeDecodingConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK29ExtendedRuntimePerfKnobConfig", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK33SpeculativeDecodingFastLogitsInfo", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK6Result", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK6Tensor", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK7Request", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK8Response", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN18RequestPerfMetrics9TimePointE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN22KvCacheRetentionConfig25TokenRangeRetentionConfigE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache10AgentStateE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache10CacheStateE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache11SocketStateE", false], [0, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache9CommStateE", false]], "tensorrt_llm::executor::shape (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor5ShapeE", false]], "tensorrt_llm::executor::shape::base 
(c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor5Shape4BaseE", false]], "tensorrt_llm::executor::shape::dimtype64 (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor5Shape9DimType64E", false]], "tensorrt_llm::executor::shape::shape (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor5Shape5ShapeENSt16initializer_listI9DimType64EE", false], [0, "_CPPv4N12tensorrt_llm8executor5Shape5ShapeEPK9DimType64N4Base9size_typeE", false], [0, "_CPPv4N12tensorrt_llm8executor5Shape5ShapeEv", false]], "tensorrt_llm::executor::sizetype32 (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor10SizeType32E", false]], "tensorrt_llm::executor::sizetype64 (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor10SizeType64E", false]], "tensorrt_llm::executor::specdecodingstats (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor17SpecDecodingStatsE", false]], "tensorrt_llm::executor::specdecodingstats::acceptancelength (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor17SpecDecodingStats16acceptanceLengthE", false]], "tensorrt_llm::executor::specdecodingstats::draftoverhead (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor17SpecDecodingStats13draftOverheadE", false]], "tensorrt_llm::executor::specdecodingstats::iterlatencyms (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor17SpecDecodingStats13iterLatencyMSE", false]], "tensorrt_llm::executor::specdecodingstats::numacceptedtokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor17SpecDecodingStats17numAcceptedTokensE", false]], "tensorrt_llm::executor::specdecodingstats::numdrafttokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor17SpecDecodingStats14numDraftTokensE", false]], "tensorrt_llm::executor::specdecodingstats::numrequestswithdrafttokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor17SpecDecodingStats26numRequestsWithDraftTokensE", false]], "tensorrt_llm::executor::speculativedecodingconfig (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor25SpeculativeDecodingConfigE", false]], 
"tensorrt_llm::executor::speculativedecodingconfig::fastlogits (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor25SpeculativeDecodingConfig10fastLogitsE", false]], "tensorrt_llm::executor::speculativedecodingconfig::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor25SpeculativeDecodingConfigeqERK25SpeculativeDecodingConfig", false]], "tensorrt_llm::executor::speculativedecodingconfig::speculativedecodingconfig (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor25SpeculativeDecodingConfig25SpeculativeDecodingConfigEb", false]], "tensorrt_llm::executor::speculativedecodingfastlogitsinfo (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor33SpeculativeDecodingFastLogitsInfoE", false]], "tensorrt_llm::executor::speculativedecodingfastlogitsinfo::draftparticipantid (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor33SpeculativeDecodingFastLogitsInfo18draftParticipantIdE", false]], "tensorrt_llm::executor::speculativedecodingfastlogitsinfo::draftrequestid (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor33SpeculativeDecodingFastLogitsInfo14draftRequestIdE", false]], "tensorrt_llm::executor::speculativedecodingfastlogitsinfo::totensor (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor33SpeculativeDecodingFastLogitsInfo8toTensorEv", false]], "tensorrt_llm::executor::staticbatchingstats (c++ struct)": [[0, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStatsE", false]], "tensorrt_llm::executor::staticbatchingstats::emptygenslots (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStats13emptyGenSlotsE", false]], "tensorrt_llm::executor::staticbatchingstats::numcontextrequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStats18numContextRequestsE", false]], "tensorrt_llm::executor::staticbatchingstats::numctxtokens (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStats12numCtxTokensE", false]], "tensorrt_llm::executor::staticbatchingstats::numgentokens (c++ member)": [[0, 
"_CPPv4N12tensorrt_llm8executor19StaticBatchingStats12numGenTokensE", false]], "tensorrt_llm::executor::staticbatchingstats::numscheduledrequests (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStats20numScheduledRequestsE", false]], "tensorrt_llm::executor::streamptr (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor9StreamPtrE", false]], "tensorrt_llm::executor::tensor (c++ class)": [[0, "_CPPv4N12tensorrt_llm8executor6TensorE", false]], "tensorrt_llm::executor::tensor::copyto (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor6copyToENSt10shared_ptrI4ImplEE13CudaStreamPtr", false]], "tensorrt_llm::executor::tensor::copytocpu (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor9copyToCpuEN6Tensor13CudaStreamPtrE", false]], "tensorrt_llm::executor::tensor::copytogpu (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor9copyToGpuEN6Tensor13CudaStreamPtrE", false]], "tensorrt_llm::executor::tensor::copytomanaged (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor13copyToManagedEN6Tensor13CudaStreamPtrE", false]], "tensorrt_llm::executor::tensor::copytopinned (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor12copyToPinnedEN6Tensor13CudaStreamPtrE", false]], "tensorrt_llm::executor::tensor::copytopooledpinned (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor18copyToPooledPinnedEN6Tensor13CudaStreamPtrE", false]], "tensorrt_llm::executor::tensor::cpu (c++ function)": [[0, "_CPPv4I0EN12tensorrt_llm8executor6Tensor3cpuE6Tensor5Shape", false], [0, "_CPPv4N12tensorrt_llm8executor6Tensor3cpuE8DataType5Shape", false]], "tensorrt_llm::executor::tensor::cudastreamptr (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor6Tensor13CudaStreamPtrE", false]], "tensorrt_llm::executor::tensor::detail::ofitensor (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor6Tensor6detail9ofITensorENSt10shared_ptrIN7runtime7ITensorEEE", false]], "tensorrt_llm::executor::tensor::detail::toitensor (c++ function)": [[0, 
"_CPPv4N12tensorrt_llm8executor6Tensor6detail9toITensorERK6Tensor", false]], "tensorrt_llm::executor::tensor::getdata (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor6Tensor7getDataEv", false], [0, "_CPPv4NK12tensorrt_llm8executor6Tensor7getDataEv", false]], "tensorrt_llm::executor::tensor::getdatatype (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor11getDataTypeEv", false]], "tensorrt_llm::executor::tensor::getmemorytype (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor13getMemoryTypeEv", false]], "tensorrt_llm::executor::tensor::getruntimetype (c++ function)": [[0, "_CPPv4I0EN12tensorrt_llm8executor6Tensor14getRuntimeTypeE8DataTypev", false]], "tensorrt_llm::executor::tensor::getshape (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor8getShapeEv", false]], "tensorrt_llm::executor::tensor::getsize (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor7getSizeEv", false]], "tensorrt_llm::executor::tensor::getsizeinbytes (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6Tensor14getSizeInBytesEv", false]], "tensorrt_llm::executor::tensor::gpu (c++ function)": [[0, "_CPPv4I0EN12tensorrt_llm8executor6Tensor3gpuE6Tensor13CudaStreamPtr5Shape", false], [0, "_CPPv4N12tensorrt_llm8executor6Tensor3gpuE8DataType13CudaStreamPtr5Shape", false]], "tensorrt_llm::executor::tensor::impl (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor6Tensor4ImplE", false]], "tensorrt_llm::executor::tensor::managed (c++ function)": [[0, "_CPPv4I0EN12tensorrt_llm8executor6Tensor7managedE6Tensor5Shape", false], [0, "_CPPv4N12tensorrt_llm8executor6Tensor7managedE8DataType5Shape", false]], "tensorrt_llm::executor::tensor::mtensor (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor6Tensor7mTensorE", false]], "tensorrt_llm::executor::tensor::of (c++ function)": [[0, "_CPPv4I0EN12tensorrt_llm8executor6Tensor2ofE6TensorP1T5Shape", false], [0, "_CPPv4I0EN12tensorrt_llm8executor6Tensor2ofE6TensorR1T", false], [0, 
"_CPPv4N12tensorrt_llm8executor6Tensor2ofE8DataTypePv5Shape", false]], "tensorrt_llm::executor::tensor::operator bool (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6TensorcvbEv", false]], "tensorrt_llm::executor::tensor::operator!= (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6TensorneERK6Tensor", false]], "tensorrt_llm::executor::tensor::operator= (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor6TensoraSERK6Tensor", false], [0, "_CPPv4N12tensorrt_llm8executor6TensoraSERR6Tensor", false]], "tensorrt_llm::executor::tensor::operator== (c++ function)": [[0, "_CPPv4NK12tensorrt_llm8executor6TensoreqERK6Tensor", false]], "tensorrt_llm::executor::tensor::pinned (c++ function)": [[0, "_CPPv4I0EN12tensorrt_llm8executor6Tensor6pinnedE6Tensor5Shape", false], [0, "_CPPv4N12tensorrt_llm8executor6Tensor6pinnedE8DataType5Shape", false]], "tensorrt_llm::executor::tensor::pooledpinned (c++ function)": [[0, "_CPPv4I0EN12tensorrt_llm8executor6Tensor12pooledPinnedE6Tensor5Shape", false], [0, "_CPPv4N12tensorrt_llm8executor6Tensor12pooledPinnedE8DataType5Shape", false]], "tensorrt_llm::executor::tensor::setfrom (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor6Tensor7setFromERK6Tensor13CudaStreamPtr", false]], "tensorrt_llm::executor::tensor::setzero (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor6Tensor7setZeroE13CudaStreamPtr", false]], "tensorrt_llm::executor::tensor::tensor (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorENSt10shared_ptrIN7runtime7ITensorEEE", false], [0, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorERK6Tensor", false], [0, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorERR6Tensor", false], [0, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorEv", false]], "tensorrt_llm::executor::tensor::~tensor (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor6TensorD0Ev", false]], "tensorrt_llm::executor::tensorptr (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor9TensorPtrE", false]], "tensorrt_llm::executor::tokenidtype (c++ 
type)": [[0, "_CPPv4N12tensorrt_llm8executor11TokenIdTypeE", false]], "tensorrt_llm::executor::typetraits (c++ struct)": [[0, "_CPPv4I0_bEN12tensorrt_llm8executor10TypeTraitsE", false]], "tensorrt_llm::executor::typetraits<bool> (c++ struct)": [[0, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsIbEE", false]], "tensorrt_llm::executor::typetraits<bool>::value (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10TypeTraitsIbE5valueE", false]], "tensorrt_llm::executor::typetraits<float> (c++ struct)": [[0, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsIfEE", false]], "tensorrt_llm::executor::typetraits<float>::value (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10TypeTraitsIfE5valueE", false]], "tensorrt_llm::executor::typetraits<half> (c++ struct)": [[0, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsI4halfEE", false]], "tensorrt_llm::executor::typetraits<half>::value (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10TypeTraitsI4halfE5valueE", false]], "tensorrt_llm::executor::typetraits<std::int32_t> (c++ struct)": [[0, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsINSt7int32_tEEE", false]], "tensorrt_llm::executor::typetraits<std::int32_t>::value (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10TypeTraitsINSt7int32_tEE5valueE", false]], "tensorrt_llm::executor::typetraits<std::int64_t> (c++ struct)": [[0, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsINSt7int64_tEEE", false]], "tensorrt_llm::executor::typetraits<std::int64_t>::value (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10TypeTraitsINSt7int64_tEE5valueE", false]], "tensorrt_llm::executor::typetraits<std::int8_t> (c++ struct)": [[0, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsINSt6int8_tEEE", false]], "tensorrt_llm::executor::typetraits<std::int8_t>::value (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10TypeTraitsINSt6int8_tEE5valueE", false]], "tensorrt_llm::executor::typetraits<std::uint8_t> (c++ struct)": [[0, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsINSt7uint8_tEEE", false]], 
"tensorrt_llm::executor::typetraits<std::uint8_t>::value (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10TypeTraitsINSt7uint8_tEE5valueE", false]], "tensorrt_llm::executor::typetraits<t*> (c++ struct)": [[0, "_CPPv4I0EN12tensorrt_llm8executor10TypeTraitsIP1TEE", false]], "tensorrt_llm::executor::typetraits<t*>::value (c++ member)": [[0, "_CPPv4N12tensorrt_llm8executor10TypeTraitsIP1TE5valueE", false]], "tensorrt_llm::executor::veclogprobs (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor11VecLogProbsE", false]], "tensorrt_llm::executor::vectokenextraids (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor16VecTokenExtraIdsE", false]], "tensorrt_llm::executor::vectokens (c++ type)": [[0, "_CPPv4N12tensorrt_llm8executor9VecTokensE", false]], "tensorrt_llm::executor::version (c++ function)": [[0, "_CPPv4N12tensorrt_llm8executor7versionEv", false]], "tensorrt_llm::layers (c++ type)": [[1, "_CPPv4N12tensorrt_llm6layersE", false]], "tensorrt_llm::mpi (c++ type)": [[0, "_CPPv4N12tensorrt_llm3mpiE", false]], "tensorrt_llm::runtime (c++ type)": [[0, "_CPPv4N12tensorrt_llm7runtimeE", false], [1, "_CPPv4N12tensorrt_llm7runtimeE", false]], "tensorrt_llm::runtime::allreducebuffers (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffersE", false]], "tensorrt_llm::runtime::allreducebuffers::allreducebuffers (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb", false]], "tensorrt_llm::runtime::allreducebuffers::mallreducecommptrs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers18mAllReduceCommPtrsE", false]], "tensorrt_llm::runtime::allreducebuffers::mflagptrs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9mFlagPtrsE", false]], "tensorrt_llm::runtime::allreducebuffers::mipcmemoryhandles (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers17mIpcMemoryHandlesE", false]], 
"tensorrt_llm::runtime::allreducebuffers::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9TensorPtrE", false]], "tensorrt_llm::runtime::buffercast (c++ function)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime10bufferCastEP1TR7IBuffer", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime10bufferCastEPK1TRK7IBuffer", false]], "tensorrt_llm::runtime::buffercastornull (c++ function)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKN7IBuffer9SharedPtrE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKN7ITensor9SharedPtrE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKNSt8optionalIN7IBuffer9SharedPtrEEE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKNSt8optionalIN7ITensor9SharedPtrEEE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKN7IBuffer14SharedConstPtrE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKN7ITensor14SharedConstPtrE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKNSt8optionalIN7IBuffer14SharedConstPtrEEE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKNSt8optionalIN7ITensor14SharedConstPtrEEE", false]], "tensorrt_llm::runtime::bufferdatatype (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime14BufferDataTypeE", false]], "tensorrt_llm::runtime::bufferdatatype::bufferdatatype (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType14BufferDataTypeEN8nvinfer18DataTypeEbb", false]], "tensorrt_llm::runtime::bufferdatatype::getdatatype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType11getDataTypeEv", false]], "tensorrt_llm::runtime::bufferdatatype::getsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType7getSizeEv", false]], "tensorrt_llm::runtime::bufferdatatype::getsizeinbits (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType13getSizeInBitsEv", false]], 
"tensorrt_llm::runtime::bufferdatatype::ispointer (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType9isPointerEv", false]], "tensorrt_llm::runtime::bufferdatatype::isunsigned (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType10isUnsignedEv", false]], "tensorrt_llm::runtime::bufferdatatype::ktrtpointertype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType15kTrtPointerTypeE", false]], "tensorrt_llm::runtime::bufferdatatype::mdatatype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType9mDataTypeE", false]], "tensorrt_llm::runtime::bufferdatatype::mpointer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType8mPointerE", false]], "tensorrt_llm::runtime::bufferdatatype::munsigned (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType9mUnsignedE", false]], "tensorrt_llm::runtime::bufferdatatype::operator nvinfer1::datatype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataTypecvN8nvinfer18DataTypeEEv", false]], "tensorrt_llm::runtime::buffermanager (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManagerE", false]], "tensorrt_llm::runtime::buffermanager::allocate (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeN8nvinfer14DimsEN8nvinfer18DataTypeE", false], [1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeNSt6size_tEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::buffermanager (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager13BufferManagerE13CudaStreamPtrb", false]], "tensorrt_llm::runtime::buffermanager::copy (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyEPKvR7IBuffer", false], [1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyEPKvR7IBuffer10MemoryType", false], [1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferPv", false], [1, 
"_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferPv10MemoryType", false], [1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferR7IBuffer", false]], "tensorrt_llm::runtime::buffermanager::copyfrom (c++ function)": [[1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10IBufferPtrRKNSt6vectorI1TEE10MemoryType", false], [1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrP1TN8nvinfer14DimsE10MemoryType", false], [1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrRKNSt6vectorI1TEEN8nvinfer14DimsE10MemoryType", false], [1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8copyFromERK7IBuffer10MemoryType", false], [1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8copyFromERK7ITensor10MemoryType", false]], "tensorrt_llm::runtime::buffermanager::cpu (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager3cpuEN8nvinfer14DimsEN8nvinfer18DataTypeE", false], [1, "_CPPv4N12tensorrt_llm7runtime13BufferManager3cpuENSt6size_tEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::cudamempoolptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager14CudaMemPoolPtrE", false]], "tensorrt_llm::runtime::buffermanager::cudastreamptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager13CudaStreamPtrE", false]], "tensorrt_llm::runtime::buffermanager::emptybuffer (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager11emptyBufferE10MemoryTypeN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::emptytensor (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager11emptyTensorE10MemoryTypeN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::getstream (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager9getStreamEv", false]], "tensorrt_llm::runtime::buffermanager::gpu (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager3gpuEN8nvinfer14DimsEN8nvinfer18DataTypeE", false], 
[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager3gpuENSt6size_tEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::gpusync (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7gpuSyncEN8nvinfer14DimsEN8nvinfer18DataTypeE", false], [1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7gpuSyncENSt6size_tEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::ibufferptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10IBufferPtrE", false]], "tensorrt_llm::runtime::buffermanager::ipcnvls (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7ipcNvlsENSt3setIiEEN8nvinfer14DimsEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::itensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10ITensorPtrE", false]], "tensorrt_llm::runtime::buffermanager::kbyte_type (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10kBYTE_TYPEE", false]], "tensorrt_llm::runtime::buffermanager::managed (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7managedEN8nvinfer14DimsEN8nvinfer18DataTypeE", false], [1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7managedENSt6size_tEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::memorypoolfree (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager14memoryPoolFreeEv", false]], "tensorrt_llm::runtime::buffermanager::memorypoolreserved (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager18memoryPoolReservedEv", false]], "tensorrt_llm::runtime::buffermanager::memorypooltrimto (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager16memoryPoolTrimToENSt6size_tE", false]], "tensorrt_llm::runtime::buffermanager::memorypoolused (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager14memoryPoolUsedEv", false]], "tensorrt_llm::runtime::buffermanager::mpool (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager5mPoolE", false]], 
"tensorrt_llm::runtime::buffermanager::mstream (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7mStreamE", false]], "tensorrt_llm::runtime::buffermanager::mtrimpool (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager9mTrimPoolE", false]], "tensorrt_llm::runtime::buffermanager::pinned (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager6pinnedEN8nvinfer14DimsEN8nvinfer18DataTypeE", false], [1, "_CPPv4N12tensorrt_llm7runtime13BufferManager6pinnedENSt6size_tEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::pinnedpool (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10pinnedPoolEN8nvinfer14DimsEN8nvinfer18DataTypeE", false], [1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10pinnedPoolENSt6size_tEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::buffermanager::setmem (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager6setMemER7IBuffer7int32_t", false]], "tensorrt_llm::runtime::buffermanager::setzero (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager7setZeroER7IBuffer", false]], "tensorrt_llm::runtime::buffermanager::~buffermanager (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13BufferManagerD0Ev", false]], "tensorrt_llm::runtime::bufferrange (c++ class)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime11BufferRangeE", false]], "tensorrt_llm::runtime::bufferrange::base (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime11BufferRange4BaseE", false]], "tensorrt_llm::runtime::bufferrange::bufferrange (c++ function)": [[1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI1UEEbEEEN12tensorrt_llm7runtime11BufferRange11BufferRangeERK7IBuffer", false], [1, "_CPPv4I0_NSt11enable_if_tIXntNSt10is_const_vI1UEEEbEEEN12tensorrt_llm7runtime11BufferRange11BufferRangeER7IBuffer", false], [1, "_CPPv4N12tensorrt_llm7runtime11BufferRange11BufferRangeEP1T9size_type", false]], "tensorrt_llm::runtime::canaccesspeer (c++ function)": [[1, 
"_CPPv4N12tensorrt_llm7runtime13canAccessPeerERK11WorldConfig", false]], "tensorrt_llm::runtime::constpointercast (c++ function)": [[1, "_CPPv4I00EN12tensorrt_llm7runtime16constPointerCastENSt10shared_ptrINSt14remove_const_tI1TEEEERRNSt10unique_ptrI1T1DEE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime16constPointerCastENSt10shared_ptrINSt14remove_const_tI1TEEEERKNSt10shared_ptrI1TEE", false]], "tensorrt_llm::runtime::cudaevent (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime9CudaEventE", false]], "tensorrt_llm::runtime::cudaevent::cudaevent (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent9CudaEventE7pointerb", false], [1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent9CudaEventEj", false]], "tensorrt_llm::runtime::cudaevent::deleter (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7DeleterE", false]], "tensorrt_llm::runtime::cudaevent::deleter::deleter (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7Deleter7DeleterEb", false], [1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7Deleter7DeleterEv", false]], "tensorrt_llm::runtime::cudaevent::deleter::mownsevent (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7Deleter10mOwnsEventE", false]], "tensorrt_llm::runtime::cudaevent::deleter::operator() (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9CudaEvent7DeleterclE7pointer", false]], "tensorrt_llm::runtime::cudaevent::element_type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent12element_typeE", false]], "tensorrt_llm::runtime::cudaevent::eventptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent8EventPtrE", false]], "tensorrt_llm::runtime::cudaevent::get (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9CudaEvent3getEv", false]], "tensorrt_llm::runtime::cudaevent::mevent (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent6mEventE", false]], "tensorrt_llm::runtime::cudaevent::pointer (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7pointerE", false]], 
"tensorrt_llm::runtime::cudaevent::synchronize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9CudaEvent11synchronizeEv", false]], "tensorrt_llm::runtime::cudastream (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime10CudaStreamE", false]], "tensorrt_llm::runtime::cudastream::cudastream (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamE12cudaStream_t", false], [1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamE12cudaStream_tib", false], [1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamEji", false]], "tensorrt_llm::runtime::cudastream::deleter (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7DeleterE", false]], "tensorrt_llm::runtime::cudastream::deleter::deleter (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7Deleter7DeleterEb", false], [1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7Deleter7DeleterEv", false]], "tensorrt_llm::runtime::cudastream::deleter::mownsstream (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7Deleter11mOwnsStreamE", false]], "tensorrt_llm::runtime::cudastream::deleter::operator() (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream7DeleterclE12cudaStream_t", false]], "tensorrt_llm::runtime::cudastream::get (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream3getEv", false]], "tensorrt_llm::runtime::cudastream::getdevice (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream9getDeviceEv", false]], "tensorrt_llm::runtime::cudastream::mdevice (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7mDeviceE", false]], "tensorrt_llm::runtime::cudastream::mstream (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7mStreamE", false]], "tensorrt_llm::runtime::cudastream::record (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream6recordEN9CudaEvent7pointerE", false], [1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream6recordERK9CudaEvent", false]], "tensorrt_llm::runtime::cudastream::streamptr 
(c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime10CudaStream9StreamPtrE", false]], "tensorrt_llm::runtime::cudastream::synchronize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream11synchronizeEv", false]], "tensorrt_llm::runtime::cudastream::wait (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream4waitEN9CudaEvent7pointerE", false], [1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream4waitERK9CudaEvent", false]], "tensorrt_llm::runtime::datatypetraits (c++ struct)": [[1, "_CPPv4I_N8nvinfer18DataTypeE_b_bEN12tensorrt_llm7runtime14DataTypeTraitsE", false]], "tensorrt_llm::runtime::datatypetraits<kdatatype, kunsigned, true> (c++ struct)": [[1, "_CPPv4I_N8nvinfer18DataTypeE_bEN12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEEE", false]], "tensorrt_llm::runtime::datatypetraits<kdatatype, kunsigned, true>::name (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<kdatatype, kunsigned, true>::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEE4sizeE", false]], "tensorrt_llm::runtime::datatypetraits<kdatatype, kunsigned, true>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEE4typeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kbool, kunsigned> (c++ struct)": [[1, "_CPPv4I_bEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kBOOLE9kUnsignedEE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kbool, kunsigned>::name (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kBOOLE9kUnsignedE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kbool, kunsigned>::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kBOOLE9kUnsignedE4sizeE", false]], 
"tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kbool, kunsigned>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kBOOLE9kUnsignedE4typeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kfloat> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kFLOATEEE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kfloat>::name (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kFLOATEE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kfloat>::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kFLOATEE4sizeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kfloat>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kFLOATEE4typeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::khalf> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kHALFEEE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::khalf>::name (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kHALFEE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::khalf>::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kHALFEE4sizeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::khalf>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kHALFEE4typeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint32, true> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EXL1EEEE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint32, true>::name (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EXL1EEE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint32, true>::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EXL1EEE4sizeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint32, true>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EXL1EEE4typeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint32> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EEE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint32>::name (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint32>::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EE4sizeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint32>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EE4typeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint64, true> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEEE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint64, true>::name (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint64, true>::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEE4sizeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint64, true>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEE4typeE", false]], 
"tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint64> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EEE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint64>::name (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint64>::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EE4sizeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint64>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EE4typeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint8> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kINT8EEE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint8>::name (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kINT8EE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint8>::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kINT8EE4sizeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kint8>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kINT8EE4typeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kuint8, kunsigned> (c++ struct)": [[1, "_CPPv4I_bEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kUINT8E9kUnsignedEE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kuint8, kunsigned>::name (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kUINT8E9kUnsignedE4nameE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kuint8, kunsigned>::size (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kUINT8E9kUnsignedE4sizeE", false]], "tensorrt_llm::runtime::datatypetraits<nvinfer1::datatype::kuint8, kunsigned>::type (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kUINT8E9kUnsignedE4typeE", false]], "tensorrt_llm::runtime::decoder (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoderE", false]], "tensorrt_llm::runtime::decoder::beamsearchbuffers (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffersE", false]], "tensorrt_llm::runtime::decoder::beamsearchbuffers::beamsearchbuffers (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers17BeamSearchBuffersERK13BufferManager", false]], "tensorrt_llm::runtime::decoder::beamsearchbuffers::mcumlogprobstmp (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers15mCumLogProbsTmpE", false]], "tensorrt_llm::runtime::decoder::beamsearchbuffers::mnumsms (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers7mNumSMsE", false]], "tensorrt_llm::runtime::decoder::beamsearchbuffers::moutputbeamhypotheses (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers21mOutputBeamHypothesesE", false]], "tensorrt_llm::runtime::decoder::beamsearchbuffers::reshape (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers7reshapeE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::decoder::decoderstate (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderStateE", false]], "tensorrt_llm::runtime::decoder::decoderstate::allocatespeculativedecodingbuffers (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState34allocateSpeculativeDecodingBuffersE23SpeculativeDecodingModeN8nvinfer18DataTypeERK13BufferManager", false]], "tensorrt_llm::runtime::decoder::decoderstate::decoderstate (c++ function)": [[1, 
"_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState12DecoderStateEN8nvinfer18DataTypeERK13BufferManager", false]], "tensorrt_llm::runtime::decoder::decoderstate::decodinginputptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState16DecodingInputPtrE", false]], "tensorrt_llm::runtime::decoder::decoderstate::decodingoutputptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState17DecodingOutputPtrE", false]], "tensorrt_llm::runtime::decoder::decoderstate::disablelookahead (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState16disableLookaheadERK13RequestVector", false]], "tensorrt_llm::runtime::decoder::decoderstate::getacceptedlengthscumsum (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState24getAcceptedLengthsCumSumEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getacceptedpackedpaths (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState22getAcceptedPackedPathsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getallnewtokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getAllNewTokensEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getbeamsearchbuffers (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState20getBeamSearchBuffersEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getcumlogprobs (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getCumLogProbsE10SizeType32", false], [1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getCumLogProbsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getfinishedsteps (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState16getFinishedStepsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getfinishedsum (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getFinishedSumEv", false]], 
"tensorrt_llm::runtime::decoder::decoderstate::getfinishreasons (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState16getFinishReasonsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getgatheredids (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getGatheredIdsE10SizeType32", false], [1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getGatheredIdsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getids (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState6getIdsE10SizeType32", false], [1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState6getIdsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getjointdecodinginput (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState21getJointDecodingInputEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getjointdecodingoutput (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState22getJointDecodingOutputEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getlogprobs (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState11getLogProbsE10SizeType32", false], [1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState11getLogProbsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getmaxbatchsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBatchSizeEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getmaxbeamwidth (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBeamWidthEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getmaxdecodingdecodertokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState27getMaxDecodingDecoderTokensEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getmaxdecodingenginetokens (c++ function)": [[1, 
"_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getMaxDecodingEngineTokensEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getmaxsequencelength (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState20getMaxSequenceLengthEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getnextdrafttokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getNextDraftTokensEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getnextdrafttokenslengths (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState25getNextDraftTokensLengthsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getnumdecodingenginetokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getNumDecodingEngineTokensE10SizeType32", false], [1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getNumDecodingEngineTokensEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getparentids (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState12getParentIdsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getprevdrafttokenslengths (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState25getPrevDraftTokensLengthsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getsequencelengths (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsE10SizeType32", false], [1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::getspeculativedecodingmode (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getSpeculativeDecodingModeEv", false]], "tensorrt_llm::runtime::decoder::decoderstate::llmrequestptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState13LlmRequestPtrE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mbeamsearchbuffers (c++ member)": 
[[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState18mBeamSearchBuffersE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mfinishedsteps (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState14mFinishedStepsE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mjointdecodinginput (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState19mJointDecodingInputE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mjointdecodingoutput (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState20mJointDecodingOutputE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mmaxbatchsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState13mMaxBatchSizeE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mmaxbeamwidth (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState13mMaxBeamWidthE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mmaxdecodingdecodertokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState25mMaxDecodingDecoderTokensE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mmaxdecodingenginetokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24mMaxDecodingEngineTokensE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mmaxsequencelength (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState18mMaxSequenceLengthE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mnumdecodingenginetokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24mNumDecodingEngineTokensE", false]], "tensorrt_llm::runtime::decoder::decoderstate::mspeculativedecodingmode (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24mSpeculativeDecodingModeE", false]], "tensorrt_llm::runtime::decoder::decoderstate::requestvector (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState13RequestVectorE", false]], 
"tensorrt_llm::runtime::decoder::decoderstate::setnumdecodingenginetokens (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState26setNumDecodingEngineTokensE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::decoder::decoderstate::setup (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", false]], "tensorrt_llm::runtime::decoder::decoderstate::setupeagle (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState10setupEagleEN12EagleBuffers6InputsE", false]], "tensorrt_llm::runtime::decoder::decoderstate::setupexplicitdrafttokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState24setupExplicitDraftTokensEN26ExplicitDraftTokensBuffers6InputsE", false]], "tensorrt_llm::runtime::decoder::decoderstate::setuplookahead (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14setupLookaheadE24LookaheadDecodingBuffers", false]], "tensorrt_llm::runtime::decoder::decoderstate::setupspeculativedecoding (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24setupSpeculativeDecodingERK23SpeculativeDecodingMode10SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", false]], "tensorrt_llm::runtime::decoder::decoderstate::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState9TensorPtrE", false]], "tensorrt_llm::runtime::decoder_batch (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batchE", false]], "tensorrt_llm::runtime::decoder_batch::input (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5InputE", false]], "tensorrt_llm::runtime::decoder_batch::input::batchslots (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input10batchSlotsE", false]], "tensorrt_llm::runtime::decoder_batch::input::batchslotsrequestorder (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input22batchSlotsRequestOrderE", false]], "tensorrt_llm::runtime::decoder_batch::input::cacheindirection (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input16cacheIndirectionE", false]], "tensorrt_llm::runtime::decoder_batch::input::eagleinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input11eagleInputsE", false]], "tensorrt_llm::runtime::decoder_batch::input::eaglelastinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input15eagleLastInputsE", false]], "tensorrt_llm::runtime::decoder_batch::input::explicitdrafttokensinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input25explicitDraftTokensInputsE", false]], "tensorrt_llm::runtime::decoder_batch::input::explicitdrafttokenslastinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input29explicitDraftTokensLastInputsE", false]], "tensorrt_llm::runtime::decoder_batch::input::generationsteps (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input15generationStepsE", false]], "tensorrt_llm::runtime::decoder_batch::input::input (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input5InputERKNSt6vectorI14TensorConstPtrEE", false], [1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input5InputERKNSt6vectorINSt6vectorI14TensorConstPtrEEEE10SizeType32", false]], "tensorrt_llm::runtime::decoder_batch::input::logits (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input6logitsE", false]], "tensorrt_llm::runtime::decoder_batch::input::maxdecodersteps (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input15maxDecoderStepsE", false]], "tensorrt_llm::runtime::decoder_batch::input::predicteddraftlogits (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input20predictedDraftLogitsE", false]], "tensorrt_llm::runtime::decoder_batch::input::tensorconstptr (c++ type)": [[1, 
"_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input14TensorConstPtrE", false]], "tensorrt_llm::runtime::decoder_batch::input::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input9TensorPtrE", false]], "tensorrt_llm::runtime::decoder_batch::output (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch6OutputE", false]], "tensorrt_llm::runtime::decoder_batch::output::cacheindirection (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch6Output16cacheIndirectionE", false]], "tensorrt_llm::runtime::decoder_batch::output::output (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch6Output6OutputEv", false]], "tensorrt_llm::runtime::decoder_batch::output::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch6Output9TensorPtrE", false]], "tensorrt_llm::runtime::decoder_batch::request (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7RequestE", false]], "tensorrt_llm::runtime::decoder_batch::request::badwordslist (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request12badWordsListE", false]], "tensorrt_llm::runtime::decoder_batch::request::bufferptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request9BufferPtrE", false]], "tensorrt_llm::runtime::decoder_batch::request::draftlogits (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request11draftLogitsE", false]], "tensorrt_llm::runtime::decoder_batch::request::drafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request11draftTokensE", false]], "tensorrt_llm::runtime::decoder_batch::request::dtype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request5dtypeE", false]], "tensorrt_llm::runtime::decoder_batch::request::eagleconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request11eagleConfigE", false]], "tensorrt_llm::runtime::decoder_batch::request::embeddingbias (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request13embeddingBiasE", false]], "tensorrt_llm::runtime::decoder_batch::request::endid (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request5endIdE", false]], "tensorrt_llm::runtime::decoder_batch::request::generatedtokensperenginestep (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request28generatedTokensPerEngineStepE", false]], "tensorrt_llm::runtime::decoder_batch::request::ids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request3idsE", false]], "tensorrt_llm::runtime::decoder_batch::request::inputlen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request8inputLenE", false]], "tensorrt_llm::runtime::decoder_batch::request::lookaheadruntimeconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request22lookaheadRuntimeConfigE", false]], "tensorrt_llm::runtime::decoder_batch::request::maxnewtokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request12maxNewTokensE", false]], "tensorrt_llm::runtime::decoder_batch::request::medusapaths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request11medusaPathsE", false]], "tensorrt_llm::runtime::decoder_batch::request::medusatreeids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request13medusaTreeIdsE", false]], "tensorrt_llm::runtime::decoder_batch::request::request (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request7RequestE14TensorConstPtr10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EE", false]], "tensorrt_llm::runtime::decoder_batch::request::stopwordslist (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request13stopWordsListE", false]], "tensorrt_llm::runtime::decoder_batch::request::tensorconstptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request14TensorConstPtrE", false]], "tensorrt_llm::runtime::decoder_batch::request::tensorptr 
(c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request9TensorPtrE", false]], "tensorrt_llm::runtime::decodinginput (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInputE", false]], "tensorrt_llm::runtime::decodinginput::badwordslens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12badWordsLensE", false]], "tensorrt_llm::runtime::decodinginput::badwordslists (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13badWordsListsE", false]], "tensorrt_llm::runtime::decodinginput::badwordsptrs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12badWordsPtrsE", false]], "tensorrt_llm::runtime::decodinginput::batchsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput9batchSizeE", false]], "tensorrt_llm::runtime::decodinginput::batchslots (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput10batchSlotsE", false]], "tensorrt_llm::runtime::decodinginput::beamwidths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput10beamWidthsE", false]], "tensorrt_llm::runtime::decodinginput::cacheindirection (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput16cacheIndirectionE", false]], "tensorrt_llm::runtime::decodinginput::decodinginput (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13DecodingInputE10SizeType3210SizeType3210SizeType3210SizeType3214TensorConstPtr9TensorPtr14TensorConstPtr", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11eagleInputsE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs (c++ struct)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputsE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::acceptedlens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs12acceptedLensE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::acceptedpathids (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs15acceptedPathIdsE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::acceptedtokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs14acceptedTokensE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::chunkedcontextnexttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs24chunkedContextNextTokensE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::eagleinputs (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::lastdraftlens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs13lastDraftLensE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::lastdraftpaths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs14lastDraftPathsE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::lastdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs15lastDraftTokensE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::nextdraftlens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs13nextDraftLensE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::nextdraftpaths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs14nextDraftPathsE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::nextdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs15nextDraftTokensE", false]], "tensorrt_llm::runtime::decodinginput::eagleinputs::seqslots (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs8seqSlotsE", 
false]], "tensorrt_llm::runtime::decodinginput::embeddingbias (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13embeddingBiasE", false]], "tensorrt_llm::runtime::decodinginput::endids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput6endIdsE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputsE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25explicitDraftTokensInputsE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::bestpathindices (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs15bestPathIndicesE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::bestpathlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs15bestPathLengthsE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::lastdraftindices (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs16lastDraftIndicesE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::lastdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs15lastDraftTokensE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::lastgenerationlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs21lastGenerationLengthsE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::lastpositionidsbase (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs19lastPositionIdsBaseE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::masks (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs5masksE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::maxgenlengthdevice (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs18maxGenLengthDeviceE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::nextdraftindices (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs16nextDraftIndicesE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::nextdraftprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs14nextDraftProbsE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::nextdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs15nextDraftTokensE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::nextflattokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs14nextFlatTokensE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::nextgenerationlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs21nextGenerationLengthsE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::packedpositionids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs17packedPositionIdsE", false]], "tensorrt_llm::runtime::decodinginput::explicitdrafttokensinputs::seqslots (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs8seqSlotsE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputsE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime13DecodingInput25externalDraftTokensInputsE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::constantthreshold (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs17constantThresholdE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::draftlogits (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs11draftLogitsE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::draftprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs10draftProbsE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::drafttokenids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs13draftTokenIdsE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::numdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs14numDraftTokensE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::numdrafttokenshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs18numDraftTokensHostE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::step (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs4stepE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::targetprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs11targetProbsE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::usedraftlogits (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs14useDraftLogitsE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::usedraftlogitshost (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs18useDraftLogitsHostE", false]], "tensorrt_llm::runtime::decodinginput::externaldrafttokensinputs::userandomacceptancethreshold (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs28useRandomAcceptanceThresholdE", false]], "tensorrt_llm::runtime::decodinginput::finishreasons (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13finishReasonsE", false]], "tensorrt_llm::runtime::decodinginput::generationsteps (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15generationStepsE", false]], "tensorrt_llm::runtime::decodinginput::lengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput7lengthsE", false]], "tensorrt_llm::runtime::decodinginput::logits (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput6logitsE", false]], "tensorrt_llm::runtime::decodinginput::logitsvec (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput9logitsVecE", false]], "tensorrt_llm::runtime::decodinginput::lookaheadinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15lookaheadInputsE", false]], "tensorrt_llm::runtime::decodinginput::lookaheadinputs (c++ struct)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15LookaheadInputsE", false]], "tensorrt_llm::runtime::decodinginput::lookaheadinputs::tokensperstep (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15LookaheadInputs13tokensPerStepE", false]], "tensorrt_llm::runtime::decodinginput::maxattentionwindow (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput18maxAttentionWindowE", false]], "tensorrt_llm::runtime::decodinginput::maxbadwordslen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput14maxBadWordsLenE", false]], "tensorrt_llm::runtime::decodinginput::maxlength (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput9maxLengthE", false]], 
"tensorrt_llm::runtime::decodinginput::maxstopwordslen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15maxStopWordsLenE", false]], "tensorrt_llm::runtime::decodinginput::medusainputs (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputsE", false]], "tensorrt_llm::runtime::decodinginput::medusainputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12medusaInputsE", false]], "tensorrt_llm::runtime::decodinginput::medusainputs::medusacurtokensperstep (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs22medusaCurTokensPerStepE", false]], "tensorrt_llm::runtime::decodinginput::medusainputs::medusalogits (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs12medusaLogitsE", false]], "tensorrt_llm::runtime::decodinginput::medusainputs::medusapaths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs11medusaPathsE", false]], "tensorrt_llm::runtime::decodinginput::medusainputs::medusatargettokensperstep (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs25medusaTargetTokensPerStepE", false]], "tensorrt_llm::runtime::decodinginput::medusainputs::medusatreeids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs13medusaTreeIdsE", false]], "tensorrt_llm::runtime::decodinginput::norepeatngramsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput17noRepeatNgramSizeE", false]], "tensorrt_llm::runtime::decodinginput::sequencelimitlength (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput19sequenceLimitLengthE", false]], "tensorrt_llm::runtime::decodinginput::sinktokenlength (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15sinkTokenLengthE", false]], "tensorrt_llm::runtime::decodinginput::step (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput4stepE", false]], "tensorrt_llm::runtime::decodinginput::stopwordslens (c++ member)": 
[[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13stopWordsLensE", false]], "tensorrt_llm::runtime::decodinginput::stopwordslists (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput14stopWordsListsE", false]], "tensorrt_llm::runtime::decodinginput::stopwordsptrs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13stopWordsPtrsE", false]], "tensorrt_llm::runtime::decodinginput::tensorconstptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput14TensorConstPtrE", false]], "tensorrt_llm::runtime::decodinginput::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput9TensorPtrE", false]], "tensorrt_llm::runtime::decodingoutput (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutputE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypothesesE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14beamHypothesesE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::batchdones (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses10batchDonesE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::cumlogprobscba (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses14cumLogProbsCBAE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::empty (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses5emptyERK13BufferManager", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::init (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses4initERK13BufferManager11TokenIdType", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::logprobscba (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses11logProbsCBAE", false]], 
"tensorrt_llm::runtime::decodingoutput::beamhypotheses::minnormedscorescba (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses18minNormedScoresCBAE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::normedscorescba (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses15normedScoresCBAE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::numbeamscba (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses11numBeamsCBAE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::outputidscba (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses12outputIdsCBAE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::release (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses7releaseEv", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::reshape (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses7reshapeE10SizeType3210SizeType3210SizeType32", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::sequencelengthscba (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses18sequenceLengthsCBAE", false]], "tensorrt_llm::runtime::decodingoutput::beamhypotheses::slice (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses5sliceE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::decodingoutput::cacheindirection (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput16cacheIndirectionE", false]], "tensorrt_llm::runtime::decodingoutput::cumlogprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput11cumLogProbsE", false]], "tensorrt_llm::runtime::decodingoutput::decodingoutput (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14DecodingOutputE9TensorPtr9TensorPtr", false]], 
"tensorrt_llm::runtime::decodingoutput::eaglebuffers (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput12eagleBuffersE", false]], "tensorrt_llm::runtime::decodingoutput::explicitdrafttokensbuffers (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26explicitDraftTokensBuffersE", false]], "tensorrt_llm::runtime::decodingoutput::finishedsum (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput11finishedSumE", false]], "tensorrt_llm::runtime::decodingoutput::finishreasons (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput13finishReasonsE", false]], "tensorrt_llm::runtime::decodingoutput::gatheredids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput11gatheredIdsE", false]], "tensorrt_llm::runtime::decodingoutput::ids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput3idsE", false]], "tensorrt_llm::runtime::decodingoutput::knegativeinfinity (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput17kNegativeInfinityE", false]], "tensorrt_llm::runtime::decodingoutput::lengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput7lengthsE", false]], "tensorrt_llm::runtime::decodingoutput::logprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput8logProbsE", false]], "tensorrt_llm::runtime::decodingoutput::logprobstiled (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput13logProbsTiledE", false]], "tensorrt_llm::runtime::decodingoutput::lookaheadoutputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput16lookaheadOutputsE", false]], "tensorrt_llm::runtime::decodingoutput::newtokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput9newTokensE", false]], "tensorrt_llm::runtime::decodingoutput::newtokenssteps (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14newTokensStepsE", false]], "tensorrt_llm::runtime::decodingoutput::newtokensvec (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime14DecodingOutput12newTokensVecE", false]], "tensorrt_llm::runtime::decodingoutput::parentids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput9parentIdsE", false]], "tensorrt_llm::runtime::decodingoutput::speculativedecodingoutputs (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputsE", false]], "tensorrt_llm::runtime::decodingoutput::speculativedecodingoutputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26speculativeDecodingOutputsE", false]], "tensorrt_llm::runtime::decodingoutput::speculativedecodingoutputs::acceptedlengthscumsum (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs21acceptedLengthsCumSumE", false]], "tensorrt_llm::runtime::decodingoutput::speculativedecodingoutputs::acceptedtokenslen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs17acceptedTokensLenE", false]], "tensorrt_llm::runtime::decodingoutput::speculativedecodingoutputs::nextdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs15nextDraftTokensE", false]], "tensorrt_llm::runtime::decodingoutput::speculativedecodingoutputs::nextdrafttokenslen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs18nextDraftTokensLenE", false]], "tensorrt_llm::runtime::decodingoutput::speculativedecodingoutputs::pathsoffsets (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs12pathsOffsetsE", false]], "tensorrt_llm::runtime::decodingoutput::speculativedecodingoutputs::prevdrafttokenslen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs18prevDraftTokensLenE", false]], "tensorrt_llm::runtime::decodingoutput::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput9TensorPtrE", false]], 
"tensorrt_llm::runtime::deviceallocationnvls (c++ class)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime20DeviceAllocationNvlsE", false]], "tensorrt_llm::runtime::deviceallocationnvls::_capacity (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls9_capacityE", false]], "tensorrt_llm::runtime::deviceallocationnvls::_handle (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls7_handleE", false]], "tensorrt_llm::runtime::deviceallocationnvls::deviceallocationnvls (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls20DeviceAllocationNvlsEv", false]], "tensorrt_llm::runtime::deviceallocationnvls::free (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls4freeEv", false]], "tensorrt_llm::runtime::deviceallocationnvls::getcapacity (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime20DeviceAllocationNvls11getCapacityEv", false]], "tensorrt_llm::runtime::deviceallocationnvls::getipcunicastpointers (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls21getIpcUnicastPointersEv", false]], "tensorrt_llm::runtime::deviceallocationnvls::getmulticastpointer (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime20DeviceAllocationNvls19getMulticastPointerEv", false]], "tensorrt_llm::runtime::deviceallocationnvls::getunicastpointer (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime20DeviceAllocationNvls17getUnicastPointerEv", false]], "tensorrt_llm::runtime::deviceallocationnvls::reset (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls5resetE6size_tNSt3setIiEE", false]], "tensorrt_llm::runtime::deviceallocationnvls::~deviceallocationnvls (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvlsD0Ev", false]], "tensorrt_llm::runtime::eaglebuffers (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffersE", false]], "tensorrt_llm::runtime::eaglebuffers::bufferptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers9BufferPtrE", 
false]], "tensorrt_llm::runtime::eaglebuffers::chunkedcontextnexttokenshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers28chunkedContextNextTokensHostE", false]], "tensorrt_llm::runtime::eaglebuffers::cumsumgenerationlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers23cumSumGenerationLengthsE", false]], "tensorrt_llm::runtime::eaglebuffers::eaglebuffers (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12EagleBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN8executor14DecodingConfigE", false]], "tensorrt_llm::runtime::eaglebuffers::engineinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12engineInputsE", false]], "tensorrt_llm::runtime::eaglebuffers::engineoutputs (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputsE", false]], "tensorrt_llm::runtime::eaglebuffers::engineoutputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13engineOutputsE", false]], "tensorrt_llm::runtime::eaglebuffers::engineoutputs::acceptedlens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs12acceptedLensE", false]], "tensorrt_llm::runtime::eaglebuffers::engineoutputs::acceptedpaths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs13acceptedPathsE", false]], "tensorrt_llm::runtime::eaglebuffers::engineoutputs::acceptedtokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs14acceptedTokensE", false]], "tensorrt_llm::runtime::eaglebuffers::engineoutputs::chunkedcontextnexttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs24chunkedContextNextTokensE", false]], "tensorrt_llm::runtime::eaglebuffers::engineoutputs::nextdraftlens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs13nextDraftLensE", false]], 
"tensorrt_llm::runtime::eaglebuffers::engineoutputs::nextdraftpaths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs14nextDraftPathsE", false]], "tensorrt_llm::runtime::eaglebuffers::engineoutputs::nextdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs15nextDraftTokensE", false]], "tensorrt_llm::runtime::eaglebuffers::greedysamplinghost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers18greedySamplingHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6InputsE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::alllayersdrafttokenids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs22allLayersDraftTokenIdsE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::alllayersdrafttokenidspredecessor (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs33allLayersDraftTokenIdsPredecessorE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::alllayersscores (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs15allLayersScoresE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::chunkedcontextnexttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs24chunkedContextNextTokensE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::create (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs6createE10SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfig", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::currentexpandindices (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs20currentExpandIndicesE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::draftlens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs9draftLensE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::draftpaths (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs10draftPathsE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::draftpathshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs14draftPathsHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::drafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs11draftTokensE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::dynamictreemaxtopkhost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs22dynamicTreeMaxTopKHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::eaglenetctxcontextlengthshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs29eagleNetCtxContextLengthsHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::eaglenetctxpastkeyvaluelengthshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs34eagleNetCtxPastKeyValueLengthsHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::eaglenetctxrequesttypeshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs27eagleNetCtxRequestTypesHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::eaglenetgencontextlengthshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs29eagleNetGenContextLengthsHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::eaglenetgenpastkeyvaluelengthshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs34eagleNetGenPastKeyValueLengthsHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::eaglenetgenrequesttypeshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs27eagleNetGenRequestTypesHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::inputgentokenshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs18inputGenTokensHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::posterioralpha (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs14posteriorAlphaE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::posteriorthreshold (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs18posteriorThresholdE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::prevscores (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs10prevScoresE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::randomdatasample (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs16randomDataSampleE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::randomdatavalidation (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs20randomDataValidationE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::specdecodinggenerationlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs29specDecodingGenerationLengthsE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::specdecodinggenerationlengthshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs33specDecodingGenerationLengthsHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::specdecodingpackedmasks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs23specDecodingPackedMasksE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::specdecodingpositionoffsets (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs27specDecodingPositionOffsetsE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::temperatures (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs12temperaturesE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::usedynamictreehost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs18useDynamicTreeHostE", false]], "tensorrt_llm::runtime::eaglebuffers::inputs::usespecdecoding (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs15useSpecDecodingE", false]], 
"tensorrt_llm::runtime::eaglebuffers::insertinputtensors (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", false]], "tensorrt_llm::runtime::eaglebuffers::itensor (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers7ITensorE", false]], "tensorrt_llm::runtime::eaglebuffers::llmrequestptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13LlmRequestPtrE", false]], "tensorrt_llm::runtime::eaglebuffers::maxgenerationlength (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers19maxGenerationLengthE", false]], "tensorrt_llm::runtime::eaglebuffers::mdefaultposteriorthreshold (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers26mDefaultPosteriorThresholdE", false]], "tensorrt_llm::runtime::eaglebuffers::mdogreedysampling (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers17mDoGreedySamplingE", false]], "tensorrt_llm::runtime::eaglebuffers::posterioralphahost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers18posteriorAlphaHostE", false]], "tensorrt_llm::runtime::eaglebuffers::posteriorthresholdhost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers22posteriorThresholdHostE", false]], "tensorrt_llm::runtime::eaglebuffers::requestvector (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13RequestVectorE", false]], "tensorrt_llm::runtime::eaglebuffers::reshape (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", false]], "tensorrt_llm::runtime::eaglebuffers::scanreducetempstorage (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers21scanReduceTempStorageE", false]], "tensorrt_llm::runtime::eaglebuffers::scanreducetempstoragebytes (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers26scanReduceTempStorageBytesE", false]], "tensorrt_llm::runtime::eaglebuffers::setfrominputs (c++ function)": [[1, 
"_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", false], [1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", false]], "tensorrt_llm::runtime::eaglebuffers::sizetype32 (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers10SizeType32E", false]], "tensorrt_llm::runtime::eaglebuffers::tensormap (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers9TensorMapE", false]], "tensorrt_llm::runtime::eaglebuffers::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers9TensorPtrE", false]], "tensorrt_llm::runtime::eaglemodule (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime11EagleModuleE", false]], "tensorrt_llm::runtime::eaglemodule::eaglemodule (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11EagleModule11EagleModuleE10SizeType3210SizeType3210SizeType3210SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime11EagleModule11EagleModuleEv", false]], "tensorrt_llm::runtime::eaglemodule::getdefaulteaglechoices (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11EagleModule22getDefaultEagleChoicesEv", false]], "tensorrt_llm::runtime::eaglemodule::getmaxnonleafnodesperlayer (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11EagleModule26getMaxNonLeafNodesPerLayerEv", false]], "tensorrt_llm::runtime::eaglemodule::getnumtransformerlayers (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11EagleModule23getNumTransformerLayersEv", false]], "tensorrt_llm::runtime::eaglemodule::mdefaulteaglechoices (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11EagleModule20mDefaultEagleChoicesE", false]], "tensorrt_llm::runtime::eaglemodule::mmaxnonleafnodesperlayer (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime11EagleModule24mMaxNonLeafNodesPerLayerE", false]], "tensorrt_llm::runtime::eaglemodule::mnumtransformerslayer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11EagleModule21mNumTransformersLayerE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffersE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::bufferptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers9BufferPtrE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::cumsumgenerationlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers23cumSumGenerationLengthsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineinputs (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers12EngineInputsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineinputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers12engineInputsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineinputs::positionoffsets (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers12EngineInputs15positionOffsetsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineinputs::requesttypesdevice (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers12EngineInputs18requestTypesDeviceE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13engineOutputsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::bestpathindices (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs15bestPathIndicesE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::bestpathlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs15bestPathLengthsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::masks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs5masksE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::maxgentoken (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs11maxGenTokenE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::nextdraftindices (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs16nextDraftIndicesE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::nextdraftprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs14nextDraftProbsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::nextdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs15nextDraftTokensE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::nextflattokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs14nextFlatTokensE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::nextgenerationlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs21nextGenerationLengthsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::nextpositionoffsets (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs19nextPositionOffsetsE", false]], 
"tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::packedpositionids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs17packedPositionIdsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::engineoutputs::totalgentoken (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs13totalGenTokenE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::explicitdrafttokensbuffers (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers26ExplicitDraftTokensBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6InputsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::create (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs6createE10SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::draftindices (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs12draftIndicesE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::draftprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs10draftProbsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::drafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs11draftTokensE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::generationlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs17generationLengthsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::generationlengthshost (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs21generationLengthsHostE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::maxgenlengthhost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs16maxGenLengthHostE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::packedmasks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs11packedMasksE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::positionids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs11positionIdsE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::positionidsbase (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs15positionIdsBaseE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::randomdatasample (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs16randomDataSampleE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::randomdatavalidation (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs20randomDataValidationE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::temperatures (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs12temperaturesE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::inputs::usespecdecoding (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs15useSpecDecodingE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::insertinputtensors (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::itensor (c++ type)": [[1, 
"_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers7ITensorE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::reshape (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::scantempstorage (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers15scanTempStorageE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::scantempstoragebytes (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers20scanTempStorageBytesE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::setfrominputs (c++ function)": [[1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", false], [1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::sizetype32 (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers10SizeType32E", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::tensormap (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers9TensorMapE", false]], "tensorrt_llm::runtime::explicitdrafttokensbuffers::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers9TensorPtrE", false]], "tensorrt_llm::runtime::genericprompttuningparams (c++ class)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime25GenericPromptTuningParamsE", false]], "tensorrt_llm::runtime::genericprompttuningparams::embeddingtable (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams14embeddingTableE", false]], "tensorrt_llm::runtime::genericprompttuningparams::genericprompttuningparams (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams25GenericPromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", false]], "tensorrt_llm::runtime::genericprompttuningparams::prompttuningenabled (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams19promptTuningEnabledE", false]], "tensorrt_llm::runtime::genericprompttuningparams::sizetype32 (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams10SizeType32E", false]], "tensorrt_llm::runtime::genericprompttuningparams::tasks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams5tasksE", false]], "tensorrt_llm::runtime::genericprompttuningparams::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams9TensorPtrE", false]], "tensorrt_llm::runtime::genericprompttuningparams::vocabsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams9vocabSizeE", false]], "tensorrt_llm::runtime::getdefaultbatchslots (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20getDefaultBatchSlotsEN7runtime10SizeType32E", false]], "tensorrt_llm::runtime::gptdecoder (c++ class)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime10GptDecoderE", false]], "tensorrt_llm::runtime::gptdecoder::cudastreamptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder13CudaStreamPtrE", false]], "tensorrt_llm::runtime::gptdecoder::disablelookahead (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", false]], "tensorrt_llm::runtime::gptdecoder::forwardasync (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder12forwardAsyncER14DecodingOutputRK13DecodingInput", false]], "tensorrt_llm::runtime::gptdecoder::forwardsync (c++ function)": [[1, 
"_CPPv4N12tensorrt_llm7runtime10GptDecoder11forwardSyncER14DecodingOutputRK13DecodingInput", false]], "tensorrt_llm::runtime::gptdecoder::getsamplingconfig (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder17getSamplingConfigEv", false]], "tensorrt_llm::runtime::gptdecoder::gptdecoder (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", false]], "tensorrt_llm::runtime::gptdecoder::mdecodinglayerworkspace (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder23mDecodingLayerWorkspaceE", false]], "tensorrt_llm::runtime::gptdecoder::mdecodingmode (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder13mDecodingModeE", false]], "tensorrt_llm::runtime::gptdecoder::mdynamicdecodelayer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder19mDynamicDecodeLayerE", false]], "tensorrt_llm::runtime::gptdecoder::mmanager (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder8mManagerE", false]], "tensorrt_llm::runtime::gptdecoder::mmaxbatchsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder13mMaxBatchSizeE", false]], "tensorrt_llm::runtime::gptdecoder::msamplingconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder15mSamplingConfigE", false]], "tensorrt_llm::runtime::gptdecoder::mvocabsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10mVocabSizeE", false]], "tensorrt_llm::runtime::gptdecoder::mvocabsizepadded (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder16mVocabSizePaddedE", false]], "tensorrt_llm::runtime::gptdecoder::setup (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", false]], "tensorrt_llm::runtime::gptdecoder::tensorptr (c++ type)": [[1, 
"_CPPv4N12tensorrt_llm7runtime10GptDecoder9TensorPtrE", false]], "tensorrt_llm::runtime::gptdecoderbatched (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatchedE", false]], "tensorrt_llm::runtime::gptdecoderbatched::cudastreamptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13CudaStreamPtrE", false]], "tensorrt_llm::runtime::gptdecoderbatched::disablelookahead (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched16disableLookaheadERK13RequestVectorRK9TensorPtr", false]], "tensorrt_llm::runtime::gptdecoderbatched::finalize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", false]], "tensorrt_llm::runtime::gptdecoderbatched::forward (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE", false]], "tensorrt_llm::runtime::gptdecoderbatched::forwardasync (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE", false]], "tensorrt_llm::runtime::gptdecoderbatched::forwarddispatch (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15forwardDispatchERN13decoder_batch6OutputERKN13decoder_batch5InputE", false]], "tensorrt_llm::runtime::gptdecoderbatched::getbuffermanager (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched16getBufferManagerEv", false]], "tensorrt_llm::runtime::gptdecoderbatched::getdecoderstate (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15getDecoderStateEv", false], [1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched15getDecoderStateEv", false]], "tensorrt_llm::runtime::gptdecoderbatched::getdecoderstream (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched16getDecoderStreamEv", false]], "tensorrt_llm::runtime::gptdecoderbatched::getunderlyingdecoder (c++ function)": [[1, 
"_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched20getUnderlyingDecoderEv", false]], "tensorrt_llm::runtime::gptdecoderbatched::gptdecoderbatched (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtrRK23SpeculativeDecodingModeN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::gptdecoderbatched::gptdecoderptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13GptDecoderPtrE", false]], "tensorrt_llm::runtime::gptdecoderbatched::llmrequestptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13LlmRequestPtrE", false]], "tensorrt_llm::runtime::gptdecoderbatched::mbuffermanager (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14mBufferManagerE", false]], "tensorrt_llm::runtime::gptdecoderbatched::mdecoder (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched8mDecoderE", false]], "tensorrt_llm::runtime::gptdecoderbatched::mdecoderstate (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13mDecoderStateE", false]], "tensorrt_llm::runtime::gptdecoderbatched::mdecoderstream (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14mDecoderStreamE", false]], "tensorrt_llm::runtime::gptdecoderbatched::mruntimestream (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14mRuntimeStreamE", false]], "tensorrt_llm::runtime::gptdecoderbatched::prepareforward (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14prepareForwardE10SizeType32RN13decoder_batch6OutputERKN13decoder_batch5InputE", false]], "tensorrt_llm::runtime::gptdecoderbatched::requestvector (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13RequestVectorE", false]], "tensorrt_llm::runtime::gptdecoderbatched::seteagleinputs (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14setEagleInputsERKN13decoder_batch5InputE", false]], 
"tensorrt_llm::runtime::gptdecoderbatched::setexplicitdrafttokensinputs (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched28setExplicitDraftTokensInputsERKN13decoder_batch5InputE", false]], "tensorrt_llm::runtime::gptdecoderbatched::setup (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", false]], "tensorrt_llm::runtime::gptdecoderbatched::sharedconstptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14SharedConstPtrE", false]], "tensorrt_llm::runtime::gptdecoderbatched::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched9TensorPtrE", false]], "tensorrt_llm::runtime::gptjsonconfig (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfigE", false]], "tensorrt_llm::runtime::gptjsonconfig::enginefilename (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14engineFilenameERK11WorldConfig", false], [1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14engineFilenameERK11WorldConfigRKNSt6stringE", false]], "tensorrt_llm::runtime::gptjsonconfig::getcontextparallelism (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig21getContextParallelismEv", false]], "tensorrt_llm::runtime::gptjsonconfig::getgpuspernode (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14getGpusPerNodeEv", false]], "tensorrt_llm::runtime::gptjsonconfig::getmodelconfig (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14getModelConfigEv", false]], "tensorrt_llm::runtime::gptjsonconfig::getmodelconfigmutable (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig21getModelConfigMutableEv", false]], "tensorrt_llm::runtime::gptjsonconfig::getname (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig7getNameEv", false]], 
"tensorrt_llm::runtime::gptjsonconfig::getpipelineparallelism (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig22getPipelineParallelismEv", false]], "tensorrt_llm::runtime::gptjsonconfig::getprecision (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig12getPrecisionEv", false]], "tensorrt_llm::runtime::gptjsonconfig::getruntimedefaults (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig18getRuntimeDefaultsEv", false]], "tensorrt_llm::runtime::gptjsonconfig::gettensorparallelism (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig20getTensorParallelismEv", false]], "tensorrt_llm::runtime::gptjsonconfig::getversion (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig10getVersionEv", false]], "tensorrt_llm::runtime::gptjsonconfig::getworldsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig12getWorldSizeEv", false]], "tensorrt_llm::runtime::gptjsonconfig::gptjsonconfig (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", false]], "tensorrt_llm::runtime::gptjsonconfig::mcontextparallelism (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig19mContextParallelismE", false]], "tensorrt_llm::runtime::gptjsonconfig::mgpuspernode (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig12mGpusPerNodeE", false]], "tensorrt_llm::runtime::gptjsonconfig::mmodelconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig12mModelConfigE", false]], "tensorrt_llm::runtime::gptjsonconfig::mname (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5mNameE", false]], "tensorrt_llm::runtime::gptjsonconfig::mpipelineparallelism (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig20mPipelineParallelismE", false]], "tensorrt_llm::runtime::gptjsonconfig::mprecision (c++ 
member)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig10mPrecisionE", false]], "tensorrt_llm::runtime::gptjsonconfig::mruntimedefaults (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig16mRuntimeDefaultsE", false]], "tensorrt_llm::runtime::gptjsonconfig::mtensorparallelism (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig18mTensorParallelismE", false]], "tensorrt_llm::runtime::gptjsonconfig::mversion (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig8mVersionE", false]], "tensorrt_llm::runtime::gptjsonconfig::parse (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5parseERKNSt10filesystem4pathE", false], [1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5parseERKNSt6stringE", false], [1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5parseERNSt7istreamE", false]], "tensorrt_llm::runtime::ibuffer (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBufferE", false]], "tensorrt_llm::runtime::ibuffer::data (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4dataENSt6size_tE", false], [1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4dataEv", false], [1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer4dataENSt6size_tE", false], [1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer4dataEv", false]], "tensorrt_llm::runtime::ibuffer::datatype (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer8DataTypeE", false]], "tensorrt_llm::runtime::ibuffer::getcapacity (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer11getCapacityEv", false]], "tensorrt_llm::runtime::ibuffer::getdatatype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer11getDataTypeEv", false]], "tensorrt_llm::runtime::ibuffer::getdatatypename (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer15getDataTypeNameE8DataType", false], [1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer15getDataTypeNameEv", false]], "tensorrt_llm::runtime::ibuffer::getmemorytype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer13getMemoryTypeEv", 
false]], "tensorrt_llm::runtime::ibuffer::getmemorytypename (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer17getMemoryTypeNameEv", false]], "tensorrt_llm::runtime::ibuffer::getsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer7getSizeEv", false]], "tensorrt_llm::runtime::ibuffer::getsizeinbytes (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer14getSizeInBytesEv", false]], "tensorrt_llm::runtime::ibuffer::ibuffer (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer7IBufferERK7IBuffer", false], [1, "_CPPv4N12tensorrt_llm7runtime7IBuffer7IBufferEv", false]], "tensorrt_llm::runtime::ibuffer::memorytype (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer10memoryTypeEPKv", false]], "tensorrt_llm::runtime::ibuffer::operator= (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBufferaSERK7IBuffer", false]], "tensorrt_llm::runtime::ibuffer::release (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer7releaseEv", false]], "tensorrt_llm::runtime::ibuffer::resize (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer6resizeENSt6size_tE", false]], "tensorrt_llm::runtime::ibuffer::sharedconstptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer14SharedConstPtrE", false]], "tensorrt_llm::runtime::ibuffer::sharedptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer9SharedPtrE", false]], "tensorrt_llm::runtime::ibuffer::slice (c++ function)": [[1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", false], [1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", false], [1, "_CPPv4N12tensorrt_llm7runtime7IBuffer5sliceE9SharedPtrNSt6size_tE", false], [1, "_CPPv4N12tensorrt_llm7runtime7IBuffer5sliceE9SharedPtrNSt6size_tENSt6size_tE", false]], 
"tensorrt_llm::runtime::ibuffer::tobytes (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer7toBytesENSt6size_tE", false]], "tensorrt_llm::runtime::ibuffer::uniqueconstptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer14UniqueConstPtrE", false]], "tensorrt_llm::runtime::ibuffer::uniqueptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBuffer9UniquePtrE", false]], "tensorrt_llm::runtime::ibuffer::view (c++ function)": [[1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer4viewE14UniqueConstPtrRR9TConstPtrNSt6size_tE", false], [1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4viewE9SharedPtr", false], [1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4viewE9SharedPtrNSt6size_tE", false]], "tensorrt_llm::runtime::ibuffer::wrap (c++ function)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tENSt6size_tE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrRNSt6vectorI1TEE", false], [1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tE", false], [1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tENSt6size_tE", false]], "tensorrt_llm::runtime::ibuffer::~ibuffer (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7IBufferD0Ev", false]], "tensorrt_llm::runtime::igptdecoder (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoderE", false]], "tensorrt_llm::runtime::igptdecoder::create (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", false]], "tensorrt_llm::runtime::igptdecoder::disablelookahead (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", 
false]], "tensorrt_llm::runtime::igptdecoder::forwardasync (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder12forwardAsyncER14DecodingOutputRK13DecodingInput", false]], "tensorrt_llm::runtime::igptdecoder::forwardsync (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder11forwardSyncER14DecodingOutputRK13DecodingInput", false]], "tensorrt_llm::runtime::igptdecoder::getsamplingconfig (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder17getSamplingConfigEv", false]], "tensorrt_llm::runtime::igptdecoder::setup (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", false]], "tensorrt_llm::runtime::igptdecoder::tensorconstptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder14TensorConstPtrE", false]], "tensorrt_llm::runtime::igptdecoder::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder9TensorPtrE", false]], "tensorrt_llm::runtime::igptdecoder::~igptdecoder (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoderD0Ev", false]], "tensorrt_llm::runtime::igptdecoderbatched (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatchedE", false]], "tensorrt_llm::runtime::igptdecoderbatched::cudastreamptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched13CudaStreamPtrE", false]], "tensorrt_llm::runtime::igptdecoderbatched::disablelookahead (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched16disableLookaheadERK13RequestVectorRK9TensorPtr", false]], "tensorrt_llm::runtime::igptdecoderbatched::finalize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime18IGptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", false]], "tensorrt_llm::runtime::igptdecoderbatched::forward (c++ function)": [[1, 
"_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE", false]], "tensorrt_llm::runtime::igptdecoderbatched::forwardasync (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE", false]], "tensorrt_llm::runtime::igptdecoderbatched::igptdecoderbatched (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched18IGptDecoderBatchedEv", false]], "tensorrt_llm::runtime::igptdecoderbatched::llmrequestptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched13LlmRequestPtrE", false]], "tensorrt_llm::runtime::igptdecoderbatched::requestvector (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched13RequestVectorE", false]], "tensorrt_llm::runtime::igptdecoderbatched::setup (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", false]], "tensorrt_llm::runtime::igptdecoderbatched::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched9TensorPtrE", false]], "tensorrt_llm::runtime::igptdecoderbatched::~igptdecoderbatched (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatchedD0Ev", false]], "tensorrt_llm::runtime::ipcmemory (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemoryE", false]], "tensorrt_llm::runtime::ipcmemory::allocateipcmemory (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory17allocateIpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfig", false]], "tensorrt_llm::runtime::ipcmemory::bufferptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9BufferPtrE", false]], "tensorrt_llm::runtime::ipcmemory::destroyipcmemory (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory16destroyIpcMemoryEv", false]], 
"tensorrt_llm::runtime::ipcmemory::flags_size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory10FLAGS_SIZEE", false]], "tensorrt_llm::runtime::ipcmemory::getcommptrs (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9IpcMemory11getCommPtrsEv", false]], "tensorrt_llm::runtime::ipcmemory::ipcmemory (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfigb", false], [1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryERK9IpcMemory", false], [1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryERR9IpcMemory", false]], "tensorrt_llm::runtime::ipcmemory::mbuffer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory7mBufferE", false]], "tensorrt_llm::runtime::ipcmemory::mcommptrs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9mCommPtrsE", false]], "tensorrt_llm::runtime::ipcmemory::mopenipc (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory8mOpenIpcE", false]], "tensorrt_llm::runtime::ipcmemory::mtprank (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory7mTpRankE", false]], "tensorrt_llm::runtime::ipcmemory::operator= (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemoryaSERK9IpcMemory", false], [1, "_CPPv4N12tensorrt_llm7runtime9IpcMemoryaSERR9IpcMemory", false]], "tensorrt_llm::runtime::ipcmemory::~ipcmemory (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9IpcMemoryD0Ev", false]], "tensorrt_llm::runtime::ipcnvlsallocate (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime15ipcNvlsAllocateE6size_tNSt3setIiEE", false]], "tensorrt_llm::runtime::ipcnvlsfree (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ipcNvlsFreeEP13IpcNvlsHandle", false]], "tensorrt_llm::runtime::ipcnvlshandle (c++ struct)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandleE", false]], "tensorrt_llm::runtime::ipcnvlshandle::ipc_uc_handles (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle14ipc_uc_handlesE", false]], 
"tensorrt_llm::runtime::ipcnvlshandle::ipc_uc_ptrs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle11ipc_uc_ptrsE", false]], "tensorrt_llm::runtime::ipcnvlshandle::ipc_uc_vas (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle10ipc_uc_vasE", false]], "tensorrt_llm::runtime::ipcnvlshandle::mc_handle (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle9mc_handleE", false]], "tensorrt_llm::runtime::ipcnvlshandle::mc_ptr (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle6mc_ptrE", false]], "tensorrt_llm::runtime::ipcnvlshandle::mc_va (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle5mc_vaE", false]], "tensorrt_llm::runtime::ipcnvlshandle::size (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle4sizeE", false]], "tensorrt_llm::runtime::ipcnvlshandle::uc_handle (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle9uc_handleE", false]], "tensorrt_llm::runtime::ipcnvlshandle::uc_ptr (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle6uc_ptrE", false]], "tensorrt_llm::runtime::ipcnvlshandle::uc_va (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle5uc_vaE", false]], "tensorrt_llm::runtime::ipcnvlssupported (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime16ipcNvlsSupportedEv", false]], "tensorrt_llm::runtime::itensor (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensorE", false]], "tensorrt_llm::runtime::itensor::at (c++ function)": [[1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atE14UniqueConstPtrRR9TConstPtrRK5Shape", false], [1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atEN7ITensor14UniqueConstPtrERR9TConstPtrRKNSt16initializer_listI9DimType64EE", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor2atE9SharedPtrRK5Shape", false], [1, 
"_CPPv4N12tensorrt_llm7runtime7ITensor2atE9SharedPtrRKNSt16initializer_listI9DimType64EE", false]], "tensorrt_llm::runtime::itensor::castsize (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor8castSizeE6size_t", false]], "tensorrt_llm::runtime::itensor::dimtype64 (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor9DimType64E", false]], "tensorrt_llm::runtime::itensor::flattenn (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor8flattenNE9SharedPtrNSt7int64_tE", false]], "tensorrt_llm::runtime::itensor::getdimension (c++ function)": [[1, "_CPPv4I_10SizeType32ENK12tensorrt_llm7runtime7ITensor12getDimensionE9DimType64v", false]], "tensorrt_llm::runtime::itensor::getshape (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime7ITensor8getShapeEv", false]], "tensorrt_llm::runtime::itensor::itensor (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor7ITensorERK7ITensor", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor7ITensorEv", false]], "tensorrt_llm::runtime::itensor::makeshape (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor9makeShapeERKNSt16initializer_listI9DimType64EE", false]], "tensorrt_llm::runtime::itensor::operator= (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensoraSERK7ITensor", false]], "tensorrt_llm::runtime::itensor::reshape (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor7reshapeERK5Shape", false]], "tensorrt_llm::runtime::itensor::resize (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor6resizeENSt6size_tE", false]], "tensorrt_llm::runtime::itensor::shape (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor5ShapeE", false]], "tensorrt_llm::runtime::itensor::shapeequals (c++ function)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor11shapeEqualsEbRK5ShapePK1T10SizeType32", false], [1, "_CPPv4I0ENK12tensorrt_llm7runtime7ITensor11shapeEqualsEbPK1T10SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor11shapeEqualsERK5ShapeRK5Shape", false], [1, 
"_CPPv4NK12tensorrt_llm7runtime7ITensor11shapeEqualsERK5Shape", false], [1, "_CPPv4NK12tensorrt_llm7runtime7ITensor11shapeEqualsERKNSt16initializer_listI10SizeType32EE", false]], "tensorrt_llm::runtime::itensor::sharedconstptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor14SharedConstPtrE", false]], "tensorrt_llm::runtime::itensor::sharedptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor9SharedPtrE", false]], "tensorrt_llm::runtime::itensor::slice (c++ function)": [[1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", false], [1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", false], [1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5Shape", false], [1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5ShapeNSt6size_tE", false], [1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EE", false], [1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EENSt6size_tE", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrNSt6size_tE", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrNSt6size_tENSt6size_tE", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRK5Shape", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRK5Shape9DimType64", false], [1, 
"_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRKNSt16initializer_listI9DimType64EE", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRKNSt16initializer_listI9DimType64EE9DimType64", false]], "tensorrt_llm::runtime::itensor::squeeze (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor7squeezeE10SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor7squeezeERK5Shape10SizeType32", false]], "tensorrt_llm::runtime::itensor::strides (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor7stridesERK5Shape", false]], "tensorrt_llm::runtime::itensor::tensormap (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor9TensorMapE", false]], "tensorrt_llm::runtime::itensor::tostring (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor8toStringERK5Shape", false]], "tensorrt_llm::runtime::itensor::uniqueconstptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor14UniqueConstPtrE", false]], "tensorrt_llm::runtime::itensor::uniqueptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor9UniquePtrE", false]], "tensorrt_llm::runtime::itensor::unsqueeze (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor9unsqueezeE10SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor9unsqueezeERK5Shape10SizeType32", false]], "tensorrt_llm::runtime::itensor::view (c++ function)": [[1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor4viewE14UniqueConstPtrRR9TConstPtrRK5Shape", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor4viewE9SharedPtr", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor4viewEN7IBuffer9SharedPtrERK5Shape", false]], "tensorrt_llm::runtime::itensor::volume (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor6volumeERK5Shape", false]], "tensorrt_llm::runtime::itensor::volumenonnegative (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensor17volumeNonNegativeERK5Shape", false]], "tensorrt_llm::runtime::itensor::wrap (c++ 
function)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5Shape", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5ShapeNSt6size_tE", false], [1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrRNSt6vectorI1TEERK5Shape", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5Shape", false], [1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5ShapeNSt6size_tE", false]], "tensorrt_llm::runtime::itensor::~itensor (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime7ITensorD0Ev", false]], "tensorrt_llm::runtime::lamportinitializeall (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20lamportInitializeAllEPvPvPv6size_t", false]], "tensorrt_llm::runtime::lookaheaddecodingbuffers (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffersE", false]], "tensorrt_llm::runtime::lookaheaddecodingbuffers::generationlengths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers17generationLengthsE", false]], "tensorrt_llm::runtime::lookaheaddecodingbuffers::lookaheaddecodingbuffers (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers24LookaheadDecodingBuffersE10SizeType3210SizeType32RK13BufferManager", false]], "tensorrt_llm::runtime::lookaheaddecodingbuffers::packedmasks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers11packedMasksE", false]], "tensorrt_llm::runtime::lookaheaddecodingbuffers::positionids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers11positionIdsE", false]], "tensorrt_llm::runtime::lookaheaddecodingbuffers::positionoffsets (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers15positionOffsetsE", false]], "tensorrt_llm::runtime::lookaheaddecodingbuffers::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers9TensorPtrE", false]], "tensorrt_llm::runtime::lookaheadmodule (c++ class)": 
[[1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModuleE", false]], "tensorrt_llm::runtime::lookaheadmodule::getexecutionconfig (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime15LookaheadModule18getExecutionConfigEv", false]], "tensorrt_llm::runtime::lookaheadmodule::lookaheadmodule (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule15LookaheadModuleE10SizeType3210SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule15LookaheadModuleEv", false]], "tensorrt_llm::runtime::lookaheadmodule::mexecutionconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule16mExecutionConfigE", false]], "tensorrt_llm::runtime::lookaheadmodule::setexecutionconfig (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule18setExecutionConfigERKN8executor23LookaheadDecodingConfigE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffersE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::batchslotshostcopy (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers18batchSlotsHostCopyE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::cumsumlength (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers12cumSumLengthE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::disablelookaheaddecoding (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers24disableLookaheadDecodingEv", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::enablelookaheaddecoding (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23enableLookaheadDecodingE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::generationlengthsdevice (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23generationLengthsDeviceE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::generationlengthshost (c++ member)": 
[[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers21generationLengthsHostE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::generationlengthshostcopy (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers25generationLengthsHostCopyE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::insertinputtensors (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers18insertInputTensorsER9TensorMapR9TensorMapRK11WorldConfig", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::lookaheadruntimebuffers (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23LookaheadRuntimeBuffersE10SizeType3210SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfigRKN8executor14DecodingConfigERK11TllmRuntime", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::packedmaskhost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers14packedMaskHostE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::packedmaskhostcopy (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers18packedMaskHostCopyE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::packedmasksdevice (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers17packedMasksDeviceE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::positionidsdevice (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers17positionIdsDeviceE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::positionidshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers15positionIdsHostE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::positionidshostcopy (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers19positionIdsHostCopyE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::positionoffsetsdevice (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers21positionOffsetsDeviceE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::positionoffsetshost (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers19positionOffsetsHostE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::positionoffsetshostcopy (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23positionOffsetsHostCopyE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::reshape (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers7reshapeE10SizeType3210SizeType3210SizeType32", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::setfrominputs (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::tensormap (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers9TensorMapE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers9TensorPtrE", false]], "tensorrt_llm::runtime::lookaheadruntimebuffers::usespecdecoding (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers15useSpecDecodingE", false]], "tensorrt_llm::runtime::loracache (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCacheE", false]], "tensorrt_llm::runtime::loracache::bump (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache4bumpE10TaskIdType", false]], "tensorrt_llm::runtime::loracache::bumptaskinprogress (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache18bumpTaskInProgressE10TaskIdType", false]], "tensorrt_llm::runtime::loracache::claimpageswithevict (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache19claimPagesWithEvictE10SizeType32", false]], 
"tensorrt_llm::runtime::loracache::copytask (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache8copyTaskE10TaskIdTypeR9LoraCacheb", false]], "tensorrt_llm::runtime::loracache::copytaskmappages (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache16copyTaskMapPagesER9TaskValueRK9TaskValueRKNSt6vectorI6size_tEERK9LoraCache", false]], "tensorrt_llm::runtime::loracache::copytopages (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", false]], "tensorrt_llm::runtime::loracache::determinenumpages (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache17determineNumPagesE10TaskIdType", false], [1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache17determineNumPagesE9TensorPtr", false]], "tensorrt_llm::runtime::loracache::fits (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache4fitsE9TensorPtr", false]], "tensorrt_llm::runtime::loracache::get (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache3getE10TaskIdType", false]], "tensorrt_llm::runtime::loracache::getnumpages (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache11getNumPagesEv", false]], "tensorrt_llm::runtime::loracache::getpageptr (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache10getPagePtrE6size_t", false]], "tensorrt_llm::runtime::loracache::getstatus (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache9getStatusE10TaskIdType", false]], "tensorrt_llm::runtime::loracache::has (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache3hasE10TaskIdType", false]], "tensorrt_llm::runtime::loracache::isdone (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache6isDoneE10TaskIdType", false]], "tensorrt_llm::runtime::loracache::isloaded (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache8isLoadedE10TaskIdType", false]], 
"tensorrt_llm::runtime::loracache::loadweights (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsE10TaskIdType9TensorPtr9TensorPtr", false], [1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsER9TaskValue9TensorPtr9TensorPtr", false]], "tensorrt_llm::runtime::loracache::loracache (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9LoraCacheERK26LoraCachePageManagerConfigRK11ModelConfigRK11WorldConfigRK13BufferManager", false]], "tensorrt_llm::runtime::loracache::markalldone (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11markAllDoneEv", false]], "tensorrt_llm::runtime::loracache::marktaskdone (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache12markTaskDoneE10TaskIdType", false]], "tensorrt_llm::runtime::loracache::mbuffermanager (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache14mBufferManagerE", false]], "tensorrt_llm::runtime::loracache::mcachemap (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9mCacheMapE", false]], "tensorrt_llm::runtime::loracache::mcachemutex (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11mCacheMutexE", false]], "tensorrt_llm::runtime::loracache::mcachepagemanager (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache17mCachePageManagerE", false]], "tensorrt_llm::runtime::loracache::mdevicebuffermanagers (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21mDeviceBufferManagersE", false]], "tensorrt_llm::runtime::loracache::mdonetasks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache10mDoneTasksE", false]], "tensorrt_llm::runtime::loracache::minprogresstasks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache16mInProgressTasksE", false]], "tensorrt_llm::runtime::loracache::mmodelconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache12mModelConfigE", false]], "tensorrt_llm::runtime::loracache::mmoduleidtomodule (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime9LoraCache17mModuleIdToModuleE", false]], "tensorrt_llm::runtime::loracache::mpagemanagerconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache18mPageManagerConfigE", false]], "tensorrt_llm::runtime::loracache::mpagesmutex (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11mPagesMutexE", false]], "tensorrt_llm::runtime::loracache::mworldconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache12mWorldConfigE", false]], "tensorrt_llm::runtime::loracache::put (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache3putE10TaskIdType9TensorPtr9TensorPtrb", false]], "tensorrt_llm::runtime::loracache::splittransposecpu (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache17splitTransposeCpuER7ITensorRK7ITensor10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::loracache::splittransposecpuinner (c++ function)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime9LoraCache22splitTransposeCpuInnerEvR7ITensorRK7ITensor10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::loracache::taskidtype (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache10TaskIdTypeE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig (c++ struct)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfigE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::adaptersize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig11adapterSizeE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::insize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig6inSizeE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::layerid (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig7layerIdE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::moduleid (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig8moduleIdE", 
false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::numslots (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig8numSlotsE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::operator== (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfigeqERKN9LoraCache21TaskLayerModuleConfigE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::outsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig7outSizeE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::pageid (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig6pageIdE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::scalingvecpointer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig17scalingVecPointerE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::slotidx (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig7slotIdxE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::tostring (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig8toStringEv", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::weightsinpointer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig16weightsInPointerE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfig::weightsoutpointer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig17weightsOutPointerE", false]], "tensorrt_llm::runtime::loracache::tasklayermoduleconfiglistptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache28TaskLayerModuleConfigListPtrE", false]], "tensorrt_llm::runtime::loracache::taskvalue (c++ struct)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValueE", false]], "tensorrt_llm::runtime::loracache::taskvalue::configs (c++ 
member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue7configsE", false]], "tensorrt_llm::runtime::loracache::taskvalue::done (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue4doneE", false]], "tensorrt_llm::runtime::loracache::taskvalue::inprogress (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue10inProgressE", false]], "tensorrt_llm::runtime::loracache::taskvalue::it (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue2itE", false]], "tensorrt_llm::runtime::loracache::taskvalue::loaded (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue6loadedE", false]], "tensorrt_llm::runtime::loracache::taskvalue::loadinprogress (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue14loadInProgressE", false]], "tensorrt_llm::runtime::loracache::taskvalue::operator= (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValueaSERR9TaskValue", false]], "tensorrt_llm::runtime::loracache::taskvalue::pageids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue7pageIdsE", false]], "tensorrt_llm::runtime::loracache::taskvalue::taskvalue (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERKNSt6vectorINSt6size_tEEERK28TaskLayerModuleConfigListPtrNSt4listI10TaskIdTypeE8iteratorEbbbb", false], [1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERR9TaskValue", false], [1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueEv", false]], "tensorrt_llm::runtime::loracache::taskvalue::~taskvalue (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValueD0Ev", false]], "tensorrt_llm::runtime::loracache::taskvalueptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache12TaskValuePtrE", false]], "tensorrt_llm::runtime::loracache::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TensorPtrE", false]], "tensorrt_llm::runtime::loracache::valuestatus (c++ enum)": [[1, 
"_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatusE", false]], "tensorrt_llm::runtime::loracache::valuestatus::kvalue_status_loaded (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatus20kVALUE_STATUS_LOADEDE", false]], "tensorrt_llm::runtime::loracache::valuestatus::kvalue_status_missing (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatus21kVALUE_STATUS_MISSINGE", false]], "tensorrt_llm::runtime::loracache::valuestatus::kvalue_status_processing (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatus24kVALUE_STATUS_PROCESSINGE", false]], "tensorrt_llm::runtime::loracachefullexception (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime22LoraCacheFullExceptionE", false]], "tensorrt_llm::runtime::loracachefullexception::loracachefullexception (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime22LoraCacheFullException22LoraCacheFullExceptionERKNSt6stringE", false]], "tensorrt_llm::runtime::loracachefullexception::~loracachefullexception (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime22LoraCacheFullExceptionD0Ev", false]], "tensorrt_llm::runtime::loracachepagemanager (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManagerE", false]], "tensorrt_llm::runtime::loracachepagemanager::blockptr (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime20LoraCachePageManager8blockPtrE10SizeType32", false]], "tensorrt_llm::runtime::loracachepagemanager::claimpages (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager10claimPagesE10SizeType32", false]], "tensorrt_llm::runtime::loracachepagemanager::initialize (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager10initializeERK13BufferManager", false]], "tensorrt_llm::runtime::loracachepagemanager::loracachepagemanager (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager20LoraCachePageManagerERK26LoraCachePageManagerConfigRK13BufferManager", false]], 
"tensorrt_llm::runtime::loracachepagemanager::mconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager7mConfigE", false]], "tensorrt_llm::runtime::loracachepagemanager::mfreepageids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager12mFreePageIdsE", false]], "tensorrt_llm::runtime::loracachepagemanager::mispagefree (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager11mIsPageFreeE", false]], "tensorrt_llm::runtime::loracachepagemanager::mpageblocks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager11mPageBlocksE", false]], "tensorrt_llm::runtime::loracachepagemanager::mutablepageptr (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager14mutablePagePtrENSt6size_tE", false]], "tensorrt_llm::runtime::loracachepagemanager::numavailablepages (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime20LoraCachePageManager17numAvailablePagesEv", false]], "tensorrt_llm::runtime::loracachepagemanager::pageptr (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime20LoraCachePageManager7pagePtrENSt6size_tE", false]], "tensorrt_llm::runtime::loracachepagemanager::releasepages (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager12releasePagesERKNSt6vectorINSt6size_tEEE", false]], "tensorrt_llm::runtime::loracachepagemanager::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager9TensorPtrE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfigE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::getdatatype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig11getDataTypeEv", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::getinittozero (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig13getInitToZeroEv", false]], 
"tensorrt_llm::runtime::loracachepagemanagerconfig::getmaxpagesperblock (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig19getMaxPagesPerBlockEv", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::getmemorytype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig13getMemoryTypeEv", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::getnumcopystreams (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig17getNumCopyStreamsEv", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::getpagewidth (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig12getPageWidthEv", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::getslotsperpage (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig15getSlotsPerPageEv", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::gettotalnumpages (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig16getTotalNumPagesEv", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::loracachepagemanagerconfig (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig26LoraCachePageManagerConfigEN7runtime10MemoryTypeEN8nvinfer18DataTypeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::mdatatype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig9mDataTypeE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::minittozero (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig11mInitToZeroE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::mmaxpagesperblock (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig17mMaxPagesPerBlockE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::mmemorytype (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig11mMemoryTypeE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::mnumcopystreams (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig15mNumCopyStreamsE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::mpagewidth (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig10mPageWidthE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::mslotsperpage (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig13mSlotsPerPageE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::mtotalnumpages (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig14mTotalNumPagesE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::setdatatype (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig11setDataTypeERKN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::setinittozero (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig13setInitToZeroEb", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::setmaxpagesperblock (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig19setMaxPagesPerBlockERK10SizeType32", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::setmemorytype (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig13setMemoryTypeERKN7runtime10MemoryTypeE", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::setnumcopystreams (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig17setNumCopyStreamsE10SizeType32", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::setpagewidth (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig12setPageWidthERK10SizeType32", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::setslotsperpage (c++ 
function)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig15setSlotsPerPageERK10SizeType32", false]], "tensorrt_llm::runtime::loracachepagemanagerconfig::settotalnumpage (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig15setTotalNumPageERK10SizeType32", false]], "tensorrt_llm::runtime::loraexpectedexception (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime21LoraExpectedExceptionE", false]], "tensorrt_llm::runtime::loraexpectedexception::loraexpectedexception (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime21LoraExpectedException21LoraExpectedExceptionERKNSt6stringE", false]], "tensorrt_llm::runtime::loraexpectedexception::~loraexpectedexception (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime21LoraExpectedExceptionD0Ev", false]], "tensorrt_llm::runtime::loramodule (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModuleE", false]], "tensorrt_llm::runtime::loramodule::createloramodules (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", false]], "tensorrt_llm::runtime::loramodule::flattenedinoutsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule18flattenedInOutSizeE10SizeType32b", false]], "tensorrt_llm::runtime::loramodule::indim (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule5inDimEv", false]], "tensorrt_llm::runtime::loramodule::indimfirst (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule10inDimFirstEv", false]], "tensorrt_llm::runtime::loramodule::insize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule6inSizeE10SizeType32", false]], "tensorrt_llm::runtime::loramodule::intpsplitdim (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule12inTpSplitDimEv", false]], "tensorrt_llm::runtime::loramodule::localinadaptersize (c++ function)": [[1, 
"_CPPv4NK12tensorrt_llm7runtime10LoraModule18localInAdapterSizeE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::loramodule::localindim (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule10localInDimE10SizeType32", false]], "tensorrt_llm::runtime::loramodule::localinoutsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule14localInOutSizeE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::loramodule::localinsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule11localInSizeE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::loramodule::localoutadaptersize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule19localOutAdapterSizeE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::loramodule::localoutdim (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule11localOutDimE10SizeType32", false]], "tensorrt_llm::runtime::loramodule::localoutsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule12localOutSizeE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::loramodule::localscalessize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule15localScalesSizeE10SizeType32b", false]], "tensorrt_llm::runtime::loramodule::localtotalsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule14localTotalSizeE10SizeType3210SizeType32b", false]], "tensorrt_llm::runtime::loramodule::loramodule (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10LoraModule", false], [1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10ModuleType10SizeType3210SizeType32bb10SizeType3210SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleEv", false]], "tensorrt_llm::runtime::loramodule::mindim (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule6mInDimE", false]], "tensorrt_llm::runtime::loramodule::mindimfirst (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime10LoraModule11mInDimFirstE", false]], "tensorrt_llm::runtime::loramodule::mintpsplitdim (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule13mInTpSplitDimE", false]], "tensorrt_llm::runtime::loramodule::moduletype (c++ enum)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleTypeE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kattn_dense (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType11kATTN_DENSEE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kattn_k (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType7kATTN_KE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kattn_q (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType7kATTN_QE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kattn_qkv (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType9kATTN_QKVE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kattn_v (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType7kATTN_VE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kcross_attn_dense (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType17kCROSS_ATTN_DENSEE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kcross_attn_k (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType13kCROSS_ATTN_KE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kcross_attn_q (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType13kCROSS_ATTN_QE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kcross_attn_qkv (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType15kCROSS_ATTN_QKVE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kcross_attn_v (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType13kCROSS_ATTN_VE", false]], 
"tensorrt_llm::runtime::loramodule::moduletype::kinvalid (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType8kINVALIDE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kmlp_4h_to_h (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMLP_4H_TO_HE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kmlp_gate (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType9kMLP_GATEE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kmlp_gate_up (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMLP_GATE_UPE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kmlp_h_to_4h (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMLP_H_TO_4HE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kmlp_router (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType11kMLP_ROUTERE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kmoe_4h_to_h (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMOE_4H_TO_HE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kmoe_gate (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType9kMOE_GATEE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kmoe_h_to_4h (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMOE_H_TO_4HE", false]], "tensorrt_llm::runtime::loramodule::moduletype::kmoe_router (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType11kMOE_ROUTERE", false]], "tensorrt_llm::runtime::loramodule::moutdim (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule7mOutDimE", false]], "tensorrt_llm::runtime::loramodule::moutdimfirst (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12mOutDimFirstE", false]], "tensorrt_llm::runtime::loramodule::mouttpsplitdim (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime10LoraModule14mOutTpSplitDimE", false]], "tensorrt_llm::runtime::loramodule::mtype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule5mTypeE", false]], "tensorrt_llm::runtime::loramodule::name (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule4nameEv", false]], "tensorrt_llm::runtime::loramodule::operator= (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModuleaSERK10LoraModule", false]], "tensorrt_llm::runtime::loramodule::outdim (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule6outDimEv", false]], "tensorrt_llm::runtime::loramodule::outdimfirst (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule11outDimFirstEv", false]], "tensorrt_llm::runtime::loramodule::outsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule7outSizeE10SizeType32", false]], "tensorrt_llm::runtime::loramodule::outtpsplitdim (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule13outTpSplitDimEv", false]], "tensorrt_llm::runtime::loramodule::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule9TensorPtrE", false]], "tensorrt_llm::runtime::loramodule::tomodulename (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12toModuleNameE10ModuleType", false], [1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12toModuleNameE10SizeType32", false]], "tensorrt_llm::runtime::loramodule::tomoduletype (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12toModuleTypeERKNSt11string_viewE", false]], "tensorrt_llm::runtime::loramodule::value (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule5valueEv", false]], "tensorrt_llm::runtime::lorataskidtype (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14LoraTaskIdTypeE", false]], "tensorrt_llm::runtime::medusamodule (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime12MedusaModuleE", false]], "tensorrt_llm::runtime::medusamodule::getmedusachoices (c++ function)": [[1, 
"_CPPv4NK12tensorrt_llm7runtime12MedusaModule16getMedusaChoicesEv", false]], "tensorrt_llm::runtime::medusamodule::mdefaultmedusachoices (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule21mDefaultMedusaChoicesE", false]], "tensorrt_llm::runtime::medusamodule::medusachoices (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule13MedusaChoicesE", false]], "tensorrt_llm::runtime::medusamodule::medusamodule (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule12MedusaModuleE10SizeType3210SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule12MedusaModuleEv", false]], "tensorrt_llm::runtime::medusamodule::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule9TensorPtrE", false]], "tensorrt_llm::runtime::memorycounters (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCountersE", false]], "tensorrt_llm::runtime::memorycounters::allocate (c++ function)": [[1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime14MemoryCounters8allocateEv10SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8allocateE10MemoryType10SizeType32", false]], "tensorrt_llm::runtime::memorycounters::bytestostring (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters13bytesToStringE10SizeType32i", false], [1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters13bytesToStringE8DiffTypei", false]], "tensorrt_llm::runtime::memorycounters::deallocate (c++ function)": [[1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime14MemoryCounters10deallocateEv10SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters10deallocateE10MemoryType10SizeType32", false]], "tensorrt_llm::runtime::memorycounters::difftype (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8DiffTypeE", false]], "tensorrt_llm::runtime::memorycounters::getcpu (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters6getCpuEv", false]], "tensorrt_llm::runtime::memorycounters::getcpudiff (c++ function)": [[1, 
"_CPPv4NK12tensorrt_llm7runtime14MemoryCounters10getCpuDiffEv", false]], "tensorrt_llm::runtime::memorycounters::getgpu (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters6getGpuEv", false]], "tensorrt_llm::runtime::memorycounters::getgpudiff (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters10getGpuDiffEv", false]], "tensorrt_llm::runtime::memorycounters::getinstance (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters11getInstanceEv", false]], "tensorrt_llm::runtime::memorycounters::getpinned (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters9getPinnedEv", false]], "tensorrt_llm::runtime::memorycounters::getpinneddiff (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters13getPinnedDiffEv", false]], "tensorrt_llm::runtime::memorycounters::getpinnedpool (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters13getPinnedPoolEv", false]], "tensorrt_llm::runtime::memorycounters::getpinnedpooldiff (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters17getPinnedPoolDiffEv", false]], "tensorrt_llm::runtime::memorycounters::getuvm (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters6getUVMEv", false]], "tensorrt_llm::runtime::memorycounters::getuvmdiff (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters10getUVMDiffEv", false]], "tensorrt_llm::runtime::memorycounters::mcpu (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters4mCpuE", false]], "tensorrt_llm::runtime::memorycounters::mcpudiff (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8mCpuDiffE", false]], "tensorrt_llm::runtime::memorycounters::memorycounters (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters14MemoryCountersEv", false]], "tensorrt_llm::runtime::memorycounters::mgpu (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters4mGpuE", false]], "tensorrt_llm::runtime::memorycounters::mgpudiff (c++ 
member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8mGpuDiffE", false]], "tensorrt_llm::runtime::memorycounters::mpinned (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters7mPinnedE", false]], "tensorrt_llm::runtime::memorycounters::mpinneddiff (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters11mPinnedDiffE", false]], "tensorrt_llm::runtime::memorycounters::mpinnedpool (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters11mPinnedPoolE", false]], "tensorrt_llm::runtime::memorycounters::mpinnedpooldiff (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters15mPinnedPoolDiffE", false]], "tensorrt_llm::runtime::memorycounters::muvm (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters4mUVME", false]], "tensorrt_llm::runtime::memorycounters::muvmdiff (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8mUVMDiffE", false]], "tensorrt_llm::runtime::memorycounters::sizetype32 (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters10SizeType32E", false]], "tensorrt_llm::runtime::memorycounters::tostring (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters8toStringEv", false]], "tensorrt_llm::runtime::memorytype (c++ enum)": [[1, "_CPPv4N12tensorrt_llm7runtime10MemoryTypeE", false]], "tensorrt_llm::runtime::memorytype::kcpu (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10MemoryType4kCPUE", false]], "tensorrt_llm::runtime::memorytype::kgpu (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10MemoryType4kGPUE", false]], "tensorrt_llm::runtime::memorytype::kpinned (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10MemoryType7kPINNEDE", false]], "tensorrt_llm::runtime::memorytype::kpinnedpool (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10MemoryType11kPINNEDPOOLE", false]], "tensorrt_llm::runtime::memorytype::kuvm (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime10MemoryType4kUVME", false]], "tensorrt_llm::runtime::memorytypestring 
(c++ struct)": [[1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime16MemoryTypeStringE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kcpu> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kCPUEEE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kcpu>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kCPUEE5valueE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kgpu> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kGPUEEE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kgpu>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kGPUEE5valueE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kpinned> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType7kPINNEDEEE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kpinned>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType7kPINNEDEE5valueE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kpinnedpool> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType11kPINNEDPOOLEEE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kpinnedpool>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType11kPINNEDPOOLEE5valueE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kuvm> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kUVMEEE", false]], "tensorrt_llm::runtime::memorytypestring<memorytype::kuvm>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kUVMEE5valueE", false]], "tensorrt_llm::runtime::modelconfig (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfigE", false]], "tensorrt_llm::runtime::modelconfig::computecontextlogits (c++ 
function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20computeContextLogitsEb", false], [1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20computeContextLogitsEv", false]], "tensorrt_llm::runtime::modelconfig::computegenerationlogits (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig23computeGenerationLogitsEb", false], [1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig23computeGenerationLogitsEv", false]], "tensorrt_llm::runtime::modelconfig::countlocallayers (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16countLocalLayersE9LayerType10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::modelconfig::countlowerranklayers (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20countLowerRankLayersE9LayerType10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::modelconfig::disableseamlesslookaheaddecoding (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig32disableSeamlessLookaheadDecodingEv", false]], "tensorrt_llm::runtime::modelconfig::enableseamlesslookaheaddecoding (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig31enableSeamlessLookaheadDecodingE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::getcontextfmha (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getContextFMHAEv", false]], "tensorrt_llm::runtime::modelconfig::getdatatype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getDataTypeEv", false]], "tensorrt_llm::runtime::modelconfig::getencoderhiddensize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getEncoderHiddenSizeEv", false]], "tensorrt_llm::runtime::modelconfig::getfirstlocallayer (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getFirstLocalLayerE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::modelconfig::getgemmallreducedtype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21getGemmAllReduceDtypeEv", false]], 
"tensorrt_llm::runtime::modelconfig::gethiddensize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13getHiddenSizeEv", false]], "tensorrt_llm::runtime::modelconfig::getkvcachetype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getKVCacheTypeEv", false]], "tensorrt_llm::runtime::modelconfig::getkvdatatype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13getKvDataTypeEv", false]], "tensorrt_llm::runtime::modelconfig::getlayertypes (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13getLayerTypesEv", false]], "tensorrt_llm::runtime::modelconfig::getlogitsdtype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getLogitsDtypeEv", false]], "tensorrt_llm::runtime::modelconfig::getloramodules (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getLoraModulesEv", false]], "tensorrt_llm::runtime::modelconfig::getmanageweightstype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getManageWeightsTypeEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxbatchsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getMaxBatchSizeEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxbeamwidth (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getMaxBeamWidthEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxdecodingdrafttokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig25getMaxDecodingDraftTokensEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxdecodingtokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getMaxDecodingTokensEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxencoderlen (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16getMaxEncoderLenEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxinputlen (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getMaxInputLenEv", false]], 
"tensorrt_llm::runtime::modelconfig::getmaxlorarank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getMaxLoraRankEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxnumtokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getMaxNumTokensEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxpositionembeddings (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig24getMaxPositionEmbeddingsEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxpromptembeddingtablesize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig30getMaxPromptEmbeddingTableSizeEv", false]], "tensorrt_llm::runtime::modelconfig::getmaxsequencelen (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig17getMaxSequenceLenEv", false]], "tensorrt_llm::runtime::modelconfig::getmlphiddensize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16getMlpHiddenSizeEv", false]], "tensorrt_llm::runtime::modelconfig::getmodelname (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getModelNameEv", false]], "tensorrt_llm::runtime::modelconfig::getmodelvariant (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getModelVariantEv", false]], "tensorrt_llm::runtime::modelconfig::getnbattentionlayers (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getNbAttentionLayersE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::modelconfig::getnbheads (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig10getNbHeadsEv", false]], "tensorrt_llm::runtime::modelconfig::getnbkvheads (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getNbKvHeadsE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::getnblayers (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::modelconfig::getnbrnnlayers (c++ function)": [[1, 
"_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getNbRnnLayersE10SizeType3210SizeType32", false]], "tensorrt_llm::runtime::modelconfig::getnumkvheadsperlayer (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21getNumKvHeadsPerLayerEv", false]], "tensorrt_llm::runtime::modelconfig::getnumkvheadsperlayerlocalrange (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig31getNumKvHeadsPerLayerLocalRangeE10SizeType3210SizeType32b", false]], "tensorrt_llm::runtime::modelconfig::getnumlanguages (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getNumLanguagesEv", false]], "tensorrt_llm::runtime::modelconfig::getoptprofilessplitpoints (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig25getOptProfilesSplitPointsEv", false]], "tensorrt_llm::runtime::modelconfig::getpagedcontextfmha (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig19getPagedContextFMHAEv", false]], "tensorrt_llm::runtime::modelconfig::getppreducescatter (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getPpReduceScatterEv", false]], "tensorrt_llm::runtime::modelconfig::getquantmode (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getQuantModeEv", false]], "tensorrt_llm::runtime::modelconfig::getrnnconfig (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getRnnConfigEv", false]], "tensorrt_llm::runtime::modelconfig::getrotaryembeddingdim (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21getRotaryEmbeddingDimEv", false]], "tensorrt_llm::runtime::modelconfig::getsizeperhead (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getSizePerHeadEv", false]], "tensorrt_llm::runtime::modelconfig::getspeculativedecodingmode (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig26getSpeculativeDecodingModeEv", false]], "tensorrt_llm::runtime::modelconfig::getspeculativedecodingmodule (c++ function)": [[1, 
"_CPPv4NK12tensorrt_llm7runtime11ModelConfig28getSpeculativeDecodingModuleEv", false]], "tensorrt_llm::runtime::modelconfig::getspeculativedecodingmoduleptr (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig31getSpeculativeDecodingModulePtrEv", false], [1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig31getSpeculativeDecodingModulePtrEv", false]], "tensorrt_llm::runtime::modelconfig::getsumlocalkvheads (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getSumLocalKvHeadsE10SizeType3210SizeType32b", false]], "tensorrt_llm::runtime::modelconfig::gettokensperblock (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig17getTokensPerBlockEv", false]], "tensorrt_llm::runtime::modelconfig::getvocabsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getVocabSizeEv", false]], "tensorrt_llm::runtime::modelconfig::getvocabsizepadded (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getVocabSizePaddedE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::hasrnnconfig (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12hasRnnConfigEv", false]], "tensorrt_llm::runtime::modelconfig::hasspeculativedecodingmodule (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig28hasSpeculativeDecodingModuleEv", false]], "tensorrt_llm::runtime::modelconfig::iscontinuouskvcache (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig19isContinuousKVCacheEv", false]], "tensorrt_llm::runtime::modelconfig::iskvcacheenabled (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16isKVCacheEnabledEv", false]], "tensorrt_llm::runtime::modelconfig::ismultimodal (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12isMultiModalEv", false]], "tensorrt_llm::runtime::modelconfig::ispagedkvcache (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14isPagedKVCacheEv", false]], "tensorrt_llm::runtime::modelconfig::isrnnbased (c++ function)": [[1, 
"_CPPv4NK12tensorrt_llm7runtime11ModelConfig10isRnnBasedEv", false]], "tensorrt_llm::runtime::modelconfig::istransformerbased (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18isTransformerBasedEv", false]], "tensorrt_llm::runtime::modelconfig::iswhisper (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig9isWhisperEv", false]], "tensorrt_llm::runtime::modelconfig::kdefault_num_tokens_per_block (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig29kDEFAULT_NUM_TOKENS_PER_BLOCKE", false]], "tensorrt_llm::runtime::modelconfig::kopt_profiles_split_points (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26kOPT_PROFILES_SPLIT_POINTSE", false]], "tensorrt_llm::runtime::modelconfig::kvcachetype (c++ enum)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11KVCacheTypeE", false]], "tensorrt_llm::runtime::modelconfig::kvcachetype::kcontinuous (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11KVCacheType11kCONTINUOUSE", false]], "tensorrt_llm::runtime::modelconfig::kvcachetype::kdisabled (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11KVCacheType9kDISABLEDE", false]], "tensorrt_llm::runtime::modelconfig::kvcachetype::kpaged (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11KVCacheType6kPAGEDE", false]], "tensorrt_llm::runtime::modelconfig::kvcachetypefromstring (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21KVCacheTypeFromStringENSt6stringE", false]], "tensorrt_llm::runtime::modelconfig::layertype (c++ enum)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerTypeE", false]], "tensorrt_llm::runtime::modelconfig::layertype::kattention (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerType10kATTENTIONE", false]], "tensorrt_llm::runtime::modelconfig::layertype::klinear (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerType7kLINEARE", false]], "tensorrt_llm::runtime::modelconfig::layertype::knoop 
(c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerType5kNOOPE", false]], "tensorrt_llm::runtime::modelconfig::layertype::krecurrent (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerType10kRECURRENTE", false]], "tensorrt_llm::runtime::modelconfig::manageweightstype (c++ enum)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17ManageWeightsTypeE", false]], "tensorrt_llm::runtime::modelconfig::manageweightstype::kdisabled (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17ManageWeightsType9kDisabledE", false]], "tensorrt_llm::runtime::modelconfig::manageweightstype::kenabled (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17ManageWeightsType8kEnabledE", false]], "tensorrt_llm::runtime::modelconfig::mcomputecontextlogits (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21mComputeContextLogitsE", false]], "tensorrt_llm::runtime::modelconfig::mcomputegenerationlogits (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24mComputeGenerationLogitsE", false]], "tensorrt_llm::runtime::modelconfig::mcontextfmha (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mContextFMHAE", false]], "tensorrt_llm::runtime::modelconfig::mdatatype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9mDataTypeE", false]], "tensorrt_llm::runtime::modelconfig::mencoderhiddensize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mEncoderHiddenSizeE", false]], "tensorrt_llm::runtime::modelconfig::mgemmallreducedtype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig19mGemmAllReduceDtypeE", false]], "tensorrt_llm::runtime::modelconfig::mhiddensize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11mHiddenSizeE", false]], "tensorrt_llm::runtime::modelconfig::minputpacked (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mInputPackedE", false]], "tensorrt_llm::runtime::modelconfig::mkvcachetype (c++ member)": 
[[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mKVCacheTypeE", false]], "tensorrt_llm::runtime::modelconfig::mlayertypes (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11mLayerTypesE", false]], "tensorrt_llm::runtime::modelconfig::mlogitsdtype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mLogitsDtypeE", false]], "tensorrt_llm::runtime::modelconfig::mloramodules (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mLoraModulesE", false]], "tensorrt_llm::runtime::modelconfig::mmanageweightstype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mManageWeightsTypeE", false]], "tensorrt_llm::runtime::modelconfig::mmaxbatchsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mMaxBatchSizeE", false]], "tensorrt_llm::runtime::modelconfig::mmaxbeamwidth (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mMaxBeamWidthE", false]], "tensorrt_llm::runtime::modelconfig::mmaxencoderlen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14mMaxEncoderLenE", false]], "tensorrt_llm::runtime::modelconfig::mmaxinputlen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mMaxInputLenE", false]], "tensorrt_llm::runtime::modelconfig::mmaxlorarank (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mMaxLoraRankE", false]], "tensorrt_llm::runtime::modelconfig::mmaxnumtokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mMaxNumTokensE", false]], "tensorrt_llm::runtime::modelconfig::mmaxpositionembeddings (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22mMaxPositionEmbeddingsE", false]], "tensorrt_llm::runtime::modelconfig::mmaxpromptembeddingtablesize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig28mMaxPromptEmbeddingTableSizeE", false]], "tensorrt_llm::runtime::modelconfig::mmaxsequencelen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15mMaxSequenceLenE", false]], 
"tensorrt_llm::runtime::modelconfig::mmlphiddensize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14mMlpHiddenSizeE", false]], "tensorrt_llm::runtime::modelconfig::mmodelname (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig10mModelNameE", false]], "tensorrt_llm::runtime::modelconfig::mmodelvariant (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mModelVariantE", false]], "tensorrt_llm::runtime::modelconfig::mnbattentionlayers (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mNbAttentionLayersE", false]], "tensorrt_llm::runtime::modelconfig::mnbheads (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig8mNbHeadsE", false]], "tensorrt_llm::runtime::modelconfig::mnblayers (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9mNbLayersE", false]], "tensorrt_llm::runtime::modelconfig::mnbrnnlayers (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mNbRnnLayersE", false]], "tensorrt_llm::runtime::modelconfig::mnumkvheadsperattentionlayer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig28mNumKvHeadsPerAttentionLayerE", false]], "tensorrt_llm::runtime::modelconfig::mnumkvheadspercrossattentionlayer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig33mNumKvHeadsPerCrossAttentionLayerE", false]], "tensorrt_llm::runtime::modelconfig::mnumlanguages (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mNumLanguagesE", false]], "tensorrt_llm::runtime::modelconfig::modelconfig (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11ModelConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::modelconfig::modelvariant (c++ enum)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariantE", false]], "tensorrt_llm::runtime::modelconfig::modelvariant::kchatglm (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant8kChatGlmE", 
false]], "tensorrt_llm::runtime::modelconfig::modelvariant::kencdec (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant7kEncDecE", false]], "tensorrt_llm::runtime::modelconfig::modelvariant::kglm (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant4kGlmE", false]], "tensorrt_llm::runtime::modelconfig::modelvariant::kgpt (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant4kGptE", false]], "tensorrt_llm::runtime::modelconfig::modelvariant::kmamba (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant6kMambaE", false]], "tensorrt_llm::runtime::modelconfig::modelvariant::krecurrentgemma (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant15kRecurrentGemmaE", false]], "tensorrt_llm::runtime::modelconfig::mpagedcontextfmha (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17mPagedContextFMHAE", false]], "tensorrt_llm::runtime::modelconfig::mpagedstate (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11mPagedStateE", false]], "tensorrt_llm::runtime::modelconfig::mppreducescatter (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig16mPpReduceScatterE", false]], "tensorrt_llm::runtime::modelconfig::mquantmode (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig10mQuantModeE", false]], "tensorrt_llm::runtime::modelconfig::mrnnconfig (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig10mRnnConfigE", false]], "tensorrt_llm::runtime::modelconfig::mrotaryembeddingdim (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig19mRotaryEmbeddingDimE", false]], "tensorrt_llm::runtime::modelconfig::msizeperhead (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mSizePerHeadE", false]], "tensorrt_llm::runtime::modelconfig::mskipcrossattnblocks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20mSkipCrossAttnBlocksE", false]], 
"tensorrt_llm::runtime::modelconfig::mspeculativedecodingmode (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24mSpeculativeDecodingModeE", false]], "tensorrt_llm::runtime::modelconfig::mspeculativedecodingmodule (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26mSpeculativeDecodingModuleE", false]], "tensorrt_llm::runtime::modelconfig::mtokensperblock (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15mTokensPerBlockE", false]], "tensorrt_llm::runtime::modelconfig::musecrossattention (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mUseCrossAttentionE", false]], "tensorrt_llm::runtime::modelconfig::musegemmallreduceplugin (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig23mUseGemmAllReducePluginE", false]], "tensorrt_llm::runtime::modelconfig::musegptattentionplugin (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22mUseGptAttentionPluginE", false]], "tensorrt_llm::runtime::modelconfig::museloraplugin (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14mUseLoraPluginE", false]], "tensorrt_llm::runtime::modelconfig::musemambaconv1dplugin (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21mUseMambaConv1dPluginE", false]], "tensorrt_llm::runtime::modelconfig::musemrope (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9mUseMropeE", false]], "tensorrt_llm::runtime::modelconfig::musepositionembedding (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21mUsePositionEmbeddingE", false]], "tensorrt_llm::runtime::modelconfig::museshapeinference (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mUseShapeInferenceE", false]], "tensorrt_llm::runtime::modelconfig::musetokentypeembedding (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22mUseTokenTypeEmbeddingE", false]], "tensorrt_llm::runtime::modelconfig::mvocabsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig10mVocabSizeE", false]], 
"tensorrt_llm::runtime::modelconfig::resetspeculativedecodingmodule (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig30resetSpeculativeDecodingModuleEv", false]], "tensorrt_llm::runtime::modelconfig::rnnconfig (c++ struct)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfigE", false]], "tensorrt_llm::runtime::modelconfig::rnnconfig::convkernel (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig10convKernelE", false]], "tensorrt_llm::runtime::modelconfig::rnnconfig::rnnconvdimsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig14rnnConvDimSizeE", false]], "tensorrt_llm::runtime::modelconfig::rnnconfig::rnnheadsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig11rnnHeadSizeE", false]], "tensorrt_llm::runtime::modelconfig::rnnconfig::rnnhiddensize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig13rnnHiddenSizeE", false]], "tensorrt_llm::runtime::modelconfig::rnnconfig::statesize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig9stateSizeE", false]], "tensorrt_llm::runtime::modelconfig::setcontextfmha (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setContextFMHAEb", false]], "tensorrt_llm::runtime::modelconfig::setencoderhiddensize (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setEncoderHiddenSizeE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setgemmallreducedtype (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21setGemmAllReduceDtypeEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::modelconfig::setkvcachetype (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setKVCacheTypeE11KVCacheType", false]], "tensorrt_llm::runtime::modelconfig::setlayertypes (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13setLayerTypesERKNSt6vectorI9LayerTypeEE", false]], "tensorrt_llm::runtime::modelconfig::setlogitsdtype (c++ 
function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setLogitsDtypeEN8nvinfer18DataTypeE", false]], "tensorrt_llm::runtime::modelconfig::setloramodules (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setLoraModulesERKNSt6vectorI10LoraModuleEE", false]], "tensorrt_llm::runtime::modelconfig::setmanageweightstype (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setManageWeightsTypeEK17ManageWeightsType", false]], "tensorrt_llm::runtime::modelconfig::setmaxbatchsize (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setMaxBatchSizeE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setmaxbeamwidth (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setMaxBeamWidthE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setmaxencoderlen (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig16setMaxEncoderLenE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setmaxinputlen (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setMaxInputLenE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setmaxlorarank (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setMaxLoraRankE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setmaxnumtokens (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setMaxNumTokensENSt8optionalI10SizeType32EE", false]], "tensorrt_llm::runtime::modelconfig::setmaxpositionembeddings (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24setMaxPositionEmbeddingsE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setmaxpromptembeddingtablesize (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig30setMaxPromptEmbeddingTableSizeE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setmaxsequencelen (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17setMaxSequenceLenE10SizeType32", false]], 
"tensorrt_llm::runtime::modelconfig::setmlphiddensize (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig16setMlpHiddenSizeE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setmodelname (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setModelNameERKNSt6stringE", false]], "tensorrt_llm::runtime::modelconfig::setmodelvariant (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setModelVariantE12ModelVariant", false]], "tensorrt_llm::runtime::modelconfig::setnbcrosskvheads (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17setNbCrossKvHeadsE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setnbkvheads (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setNbKvHeadsE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setnumkvheadspercrosslayer (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26setNumKvHeadsPerCrossLayerERKNSt6vectorI10SizeType32EE", false]], "tensorrt_llm::runtime::modelconfig::setnumkvheadsperlayer (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21setNumKvHeadsPerLayerERKNSt6vectorI10SizeType32EE", false]], "tensorrt_llm::runtime::modelconfig::setnumlanguages (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setNumLanguagesENSt8optionalI10SizeType32EE", false]], "tensorrt_llm::runtime::modelconfig::setpagedcontextfmha (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig19setPagedContextFMHAEb", false]], "tensorrt_llm::runtime::modelconfig::setppreducescatter (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18setPpReduceScatterEb", false]], "tensorrt_llm::runtime::modelconfig::setquantmode (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setQuantModeEN6common9QuantModeE", false]], "tensorrt_llm::runtime::modelconfig::setrnnconfig (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setRnnConfigERK9RnnConfig", false]], 
"tensorrt_llm::runtime::modelconfig::setrotaryembeddingdim (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21setRotaryEmbeddingDimE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setsizeperhead (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setSizePerHeadE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setskipcrossattnblocks (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22setSkipCrossAttnBlocksEb", false]], "tensorrt_llm::runtime::modelconfig::setspeculativedecodingmode (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26setSpeculativeDecodingModeE23SpeculativeDecodingMode", false]], "tensorrt_llm::runtime::modelconfig::setspeculativedecodingmodule (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig28setSpeculativeDecodingModuleERKNSt10shared_ptrI25SpeculativeDecodingModuleEE", false]], "tensorrt_llm::runtime::modelconfig::settokensperblock (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17setTokensPerBlockE10SizeType32", false]], "tensorrt_llm::runtime::modelconfig::setusecrossattention (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setUseCrossAttentionEb", false]], "tensorrt_llm::runtime::modelconfig::setusemrope (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11setUseMropeEb", false]], "tensorrt_llm::runtime::modelconfig::setusepositionembedding (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig23setUsePositionEmbeddingEb", false]], "tensorrt_llm::runtime::modelconfig::setuseshapeinference (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setUseShapeInferenceEb", false]], "tensorrt_llm::runtime::modelconfig::setusetokentypeembedding (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24setUseTokenTypeEmbeddingEb", false]], "tensorrt_llm::runtime::modelconfig::skipcrossattnblocks (c++ function)": [[1, 
"_CPPv4NK12tensorrt_llm7runtime11ModelConfig19skipCrossAttnBlocksEv", false]], "tensorrt_llm::runtime::modelconfig::supportsinflightbatching (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig24supportsInflightBatchingEv", false]], "tensorrt_llm::runtime::modelconfig::usecrossattention (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig17useCrossAttentionEv", false]], "tensorrt_llm::runtime::modelconfig::usegemmallreduceplugin (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22useGemmAllReducePluginEb", false], [1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig22useGemmAllReducePluginEv", false]], "tensorrt_llm::runtime::modelconfig::usegptattentionplugin (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21useGptAttentionPluginEb", false], [1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21useGptAttentionPluginEv", false]], "tensorrt_llm::runtime::modelconfig::uselanguageadapter (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18useLanguageAdapterEv", false]], "tensorrt_llm::runtime::modelconfig::useloraplugin (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13useLoraPluginEb", false], [1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13useLoraPluginEv", false]], "tensorrt_llm::runtime::modelconfig::usemambaconv1dplugin (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20useMambaConv1dPluginEb", false], [1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20useMambaConv1dPluginEv", false]], "tensorrt_llm::runtime::modelconfig::usemrope (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig8useMropeEv", false]], "tensorrt_llm::runtime::modelconfig::usepackedinput (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14usePackedInputEb", false], [1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14usePackedInputEv", false]], "tensorrt_llm::runtime::modelconfig::usepagedstate (c++ function)": [[1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig13usePagedStateEb", false], [1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13usePagedStateEv", false]], "tensorrt_llm::runtime::modelconfig::usepositionembedding (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20usePositionEmbeddingEv", false]], "tensorrt_llm::runtime::modelconfig::useprompttuning (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15usePromptTuningEv", false]], "tensorrt_llm::runtime::modelconfig::useshapeinference (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig17useShapeInferenceEv", false]], "tensorrt_llm::runtime::modelconfig::usetokentypeembedding (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21useTokenTypeEmbeddingEv", false]], "tensorrt_llm::runtime::mpi_group_barrier (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime17MPI_group_barrierENSt3setIiEE", false]], "tensorrt_llm::runtime::operator<< (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK10LoraModule", false], [1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK26LoraCachePageManagerConfig", false], [1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK7IBuffer", false], [1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK7ITensor", false], [1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERKN7ITensor5ShapeE", false], [1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERKN9LoraCache21TaskLayerModuleConfigE", false]], "tensorrt_llm::runtime::pointerelementtype (c++ type)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime18PointerElementTypeE", false]], "tensorrt_llm::runtime::prompttuningparams (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParamsE", false]], "tensorrt_llm::runtime::prompttuningparams::filltaskstensor (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams15fillTasksTensorE9TensorPtr10SizeType3210SizeType32RKNSt6vectorI10SizeType32EERKNSt6vectorI10SizeType32EERK13BufferManagerb", false]], 
"tensorrt_llm::runtime::prompttuningparams::prompttuningparams (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams18PromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", false]], "tensorrt_llm::runtime::prompttuningparams::sizetype32 (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams10SizeType32E", false]], "tensorrt_llm::runtime::prompttuningparams::tensorptr (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams9TensorPtrE", false]], "tensorrt_llm::runtime::rawengine (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngineE", false]], "tensorrt_llm::runtime::rawengine::getaddress (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine10getAddressEv", false]], "tensorrt_llm::runtime::rawengine::gethostmemory (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine13getHostMemoryEv", false]], "tensorrt_llm::runtime::rawengine::getmanagedweightsmapopt (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine23getManagedWeightsMapOptEv", false]], "tensorrt_llm::runtime::rawengine::getpath (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine7getPathEv", false]], "tensorrt_llm::runtime::rawengine::getpathopt (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine10getPathOptEv", false]], "tensorrt_llm::runtime::rawengine::getsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine7getSizeEv", false]], "tensorrt_llm::runtime::rawengine::gettype (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine7getTypeEv", false]], "tensorrt_llm::runtime::rawengine::mengineaddr (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine11mEngineAddrE", false]], "tensorrt_llm::runtime::rawengine::menginebuffer (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine13mEngineBufferE", false]], "tensorrt_llm::runtime::rawengine::menginepath (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine11mEnginePathE", false]], 
"tensorrt_llm::runtime::rawengine::menginesize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine11mEngineSizeE", false]], "tensorrt_llm::runtime::rawengine::mmanagedweightsmap (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine18mManagedWeightsMapE", false]], "tensorrt_llm::runtime::rawengine::mtype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine5mTypeE", false]], "tensorrt_llm::runtime::rawengine::rawengine (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineENSt10filesystem4pathE", false], [1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineEPKN8nvinfer111IHostMemoryE", false], [1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineEPKvNSt6size_tE", false]], "tensorrt_llm::runtime::rawengine::setmanagedweightsmap (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine20setManagedWeightsMapENSt3mapINSt6stringEN12tensorrt_llm8executor6TensorEEE", false]], "tensorrt_llm::runtime::rawengine::setpath (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine7setPathENSt10filesystem4pathE", false]], "tensorrt_llm::runtime::rawengine::type (c++ enum)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4TypeE", false]], "tensorrt_llm::runtime::rawengine::type::addresswithsize (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4Type15AddressWithSizeE", false]], "tensorrt_llm::runtime::rawengine::type::filepath (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4Type8FilePathE", false]], "tensorrt_llm::runtime::rawengine::type::hostmemory (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4Type10HostMemoryE", false]], "tensorrt_llm::runtime::requesttype (c++ enum)": [[1, "_CPPv4N12tensorrt_llm7runtime11RequestTypeE", false]], "tensorrt_llm::runtime::requesttype::kcontext (c++ enumerator)": [[1, "_CPPv4N12tensorrt_llm7runtime11RequestType8kCONTEXTE", false]], "tensorrt_llm::runtime::requesttype::kgeneration (c++ enumerator)": [[1, 
"_CPPv4N12tensorrt_llm7runtime11RequestType11kGENERATIONE", false]], "tensorrt_llm::runtime::runtimedefaults (c++ struct)": [[1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaultsE", false]], "tensorrt_llm::runtime::runtimedefaults::maxattentionwindowvec (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults21maxAttentionWindowVecE", false]], "tensorrt_llm::runtime::runtimedefaults::runtimedefaults (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults15RuntimeDefaultsENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalI10SizeType32EE", false], [1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults15RuntimeDefaultsEv", false]], "tensorrt_llm::runtime::runtimedefaults::sinktokenlength (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults15sinkTokenLengthE", false]], "tensorrt_llm::runtime::samplingconfig (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfigE", false]], "tensorrt_llm::runtime::samplingconfig::beamsearchdiversityrate (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig23beamSearchDiversityRateE", false]], "tensorrt_llm::runtime::samplingconfig::beamwidth (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig9beamWidthE", false]], "tensorrt_llm::runtime::samplingconfig::beamwidtharray (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14beamWidthArrayE", false]], "tensorrt_llm::runtime::samplingconfig::cumlogprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig11cumLogProbsE", false]], "tensorrt_llm::runtime::samplingconfig::draftacceptancethreshold (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig24draftAcceptanceThresholdE", false]], "tensorrt_llm::runtime::samplingconfig::earlystopping (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig13earlyStoppingE", false]], "tensorrt_llm::runtime::samplingconfig::floattype (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig9FloatTypeE", false]], 
"tensorrt_llm::runtime::samplingconfig::frequencypenalty (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig16frequencyPenaltyE", false]], "tensorrt_llm::runtime::samplingconfig::fusevalues (c++ function)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig10fuseValuesE6OptVecI1TERKNSt6vectorI14SamplingConfigEENSt8functionIF6OptVecI1TE6size_tEEE1T", false]], "tensorrt_llm::runtime::samplingconfig::getmaxbeamwidth (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14SamplingConfig15getMaxBeamWidthEv", false]], "tensorrt_llm::runtime::samplingconfig::getnumreturnbeams (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14SamplingConfig17getNumReturnBeamsEv", false]], "tensorrt_llm::runtime::samplingconfig::lengthpenalty (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig13lengthPenaltyE", false]], "tensorrt_llm::runtime::samplingconfig::minlength (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig9minLengthE", false]], "tensorrt_llm::runtime::samplingconfig::minp (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig4minPE", false]], "tensorrt_llm::runtime::samplingconfig::norepeatngramsize (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig17noRepeatNgramSizeE", false]], "tensorrt_llm::runtime::samplingconfig::normalizelogprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig17normalizeLogProbsE", false]], "tensorrt_llm::runtime::samplingconfig::numreturnsequences (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig18numReturnSequencesE", false]], "tensorrt_llm::runtime::samplingconfig::operator== (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime14SamplingConfigeqERK14SamplingConfig", false]], "tensorrt_llm::runtime::samplingconfig::optvec (c++ type)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig6OptVecE", false]], "tensorrt_llm::runtime::samplingconfig::originaltemperature (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime14SamplingConfig19originalTemperatureE", false]], "tensorrt_llm::runtime::samplingconfig::outputlogprobs (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14outputLogProbsE", false]], "tensorrt_llm::runtime::samplingconfig::presencepenalty (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig15presencePenaltyE", false]], "tensorrt_llm::runtime::samplingconfig::randomseed (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig10randomSeedE", false]], "tensorrt_llm::runtime::samplingconfig::repetitionpenalty (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig17repetitionPenaltyE", false]], "tensorrt_llm::runtime::samplingconfig::samplingconfig (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigE10SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigERKN8executor14SamplingConfigERKNSt8optionalIN8executor25ExternalDraftTokensConfigEEE", false], [1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigERKNSt6vectorI14SamplingConfigEE", false]], "tensorrt_llm::runtime::samplingconfig::temperature (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig11temperatureE", false]], "tensorrt_llm::runtime::samplingconfig::topk (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig4topKE", false]], "tensorrt_llm::runtime::samplingconfig::topkmedusaheads (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig15topKMedusaHeadsE", false]], "tensorrt_llm::runtime::samplingconfig::topp (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig4topPE", false]], "tensorrt_llm::runtime::samplingconfig::toppdecay (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig9topPDecayE", false]], "tensorrt_llm::runtime::samplingconfig::toppmin (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig7topPMinE", false]], "tensorrt_llm::runtime::samplingconfig::toppresetids 
(c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig12topPResetIdsE", false]], "tensorrt_llm::runtime::samplingconfig::usedefaultvalues (c++ function)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig16useDefaultValuesEbRK6OptVecI1TE1T", false]], "tensorrt_llm::runtime::samplingconfig::validate (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig8validateEv", false]], "tensorrt_llm::runtime::samplingconfig::validatevec (c++ function)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig11validateVecEbNSt6stringERK6OptVecI1TE1TNSt8optionalI1TEE", false]], "tensorrt_llm::runtime::sizetype32 (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime10SizeType32E", false]], "tensorrt_llm::runtime::sizetype64 (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime10SizeType64E", false]], "tensorrt_llm::runtime::speculativedecodingmode (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingModeE", false]], "tensorrt_llm::runtime::speculativedecodingmode::allbitset (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode9allBitSetE14UnderlyingType", false]], "tensorrt_llm::runtime::speculativedecodingmode::anybitset (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode9anyBitSetE14UnderlyingType", false]], "tensorrt_llm::runtime::speculativedecodingmode::drafttokensexternal (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode19DraftTokensExternalEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::eagle (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode5EagleEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::explicitdrafttokens (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode19ExplicitDraftTokensEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::hasdraftlogits (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode14hasDraftLogitsEv", false]], 
"tensorrt_llm::runtime::speculativedecodingmode::isdrafttokensexternal (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode21isDraftTokensExternalEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::iseagle (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode7isEagleEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::isexplicitdrafttokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode21isExplicitDraftTokensEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::islookaheaddecoding (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode19isLookaheadDecodingEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::ismedusa (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode8isMedusaEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::isnone (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode6isNoneEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::kdrafttokensexternal (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode20kDraftTokensExternalE", false]], "tensorrt_llm::runtime::speculativedecodingmode::keagle (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode6kEagleE", false]], "tensorrt_llm::runtime::speculativedecodingmode::kexplicitdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode20kExplicitDraftTokensE", false]], "tensorrt_llm::runtime::speculativedecodingmode::klookaheaddecoding (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode18kLookaheadDecodingE", false]], "tensorrt_llm::runtime::speculativedecodingmode::kmedusa (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode7kMedusaE", false]], "tensorrt_llm::runtime::speculativedecodingmode::knone (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode5kNoneE", false]], "tensorrt_llm::runtime::speculativedecodingmode::lookaheaddecoding (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode17LookaheadDecodingEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::medusa (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode6MedusaEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::mstate (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode6mStateE", false]], "tensorrt_llm::runtime::speculativedecodingmode::needsdecoderprologue (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode20needsDecoderPrologueEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::needskvcacherewind (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode18needsKVCacheRewindEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::none (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode4NoneEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::operator== (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingModeeqERK23SpeculativeDecodingMode", false]], "tensorrt_llm::runtime::speculativedecodingmode::predictsdrafttokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode19predictsDraftTokensEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::requiresattentionmask (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode21requiresAttentionMaskEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::speculativedecodingmode (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode23SpeculativeDecodingModeE14UnderlyingType", false]], "tensorrt_llm::runtime::speculativedecodingmode::underlyingtype (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode14UnderlyingTypeE", false]], 
"tensorrt_llm::runtime::speculativedecodingmode::updatespositionids (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode18updatesPositionIdsEv", false]], "tensorrt_llm::runtime::speculativedecodingmode::variabledraftlength (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode19variableDraftLengthEv", false]], "tensorrt_llm::runtime::speculativedecodingmodule (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModuleE", false]], "tensorrt_llm::runtime::speculativedecodingmodule::computenumpackedmasks (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule21computeNumPackedMasksEv", false]], "tensorrt_llm::runtime::speculativedecodingmodule::getmaxdecodingdrafttokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule25getMaxDecodingDraftTokensEv", false]], "tensorrt_llm::runtime::speculativedecodingmodule::getmaxdecodingtokens (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule20getMaxDecodingTokensEv", false]], "tensorrt_llm::runtime::speculativedecodingmodule::getmaxdraftpathlen (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule18getMaxDraftPathLenEv", false]], "tensorrt_llm::runtime::speculativedecodingmodule::getmaxnumpaths (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule14getMaxNumPathsEv", false]], "tensorrt_llm::runtime::speculativedecodingmodule::getmaxpathlen (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule13getMaxPathLenEv", false]], "tensorrt_llm::runtime::speculativedecodingmodule::getnumpackedmasks (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule17getNumPackedMasksEv", false]], "tensorrt_llm::runtime::speculativedecodingmodule::mmaxdecodingdrafttokens (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule23mMaxDecodingDraftTokensE", false]], 
"tensorrt_llm::runtime::speculativedecodingmodule::mmaxdraftpathlen (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule16mMaxDraftPathLenE", false]], "tensorrt_llm::runtime::speculativedecodingmodule::mmaxnumpackedmasks (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule18mMaxNumPackedMasksE", false]], "tensorrt_llm::runtime::speculativedecodingmodule::mmaxnumpaths (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule12mMaxNumPathsE", false]], "tensorrt_llm::runtime::speculativedecodingmodule::operator= (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModuleaSERK25SpeculativeDecodingModule", false]], "tensorrt_llm::runtime::speculativedecodingmodule::setmaxdraftpathlen (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule18setMaxDraftPathLenE10SizeType32", false]], "tensorrt_llm::runtime::speculativedecodingmodule::setmaxdrafttokens (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule17setMaxDraftTokensE10SizeType32", false]], "tensorrt_llm::runtime::speculativedecodingmodule::setmaxnumpaths (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule14setMaxNumPathsE10SizeType32", false]], "tensorrt_llm::runtime::speculativedecodingmodule::speculativedecodingmodule (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleE10SizeType3210SizeType3210SizeType32", false], [1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleERK25SpeculativeDecodingModule", false], [1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleEv", false]], "tensorrt_llm::runtime::speculativedecodingmodule::~speculativedecodingmodule (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModuleD0Ev", false]], "tensorrt_llm::runtime::stringptrmap (c++ type)": [[1, 
"_CPPv4I0EN12tensorrt_llm7runtime12StringPtrMapE", false]], "tensorrt_llm::runtime::tllmlogger (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime10TllmLoggerE", false]], "tensorrt_llm::runtime::tllmlogger::getlevel (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10TllmLogger8getLevelEv", false]], "tensorrt_llm::runtime::tllmlogger::log (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10TllmLogger3logE8SeverityPKN8nvinfer19AsciiCharE", false]], "tensorrt_llm::runtime::tllmlogger::setlevel (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime10TllmLogger8setLevelE8Severity", false]], "tensorrt_llm::runtime::to_string (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime9to_stringERK26LoraCachePageManagerConfig", false], [1, "_CPPv4N12tensorrt_llm7runtime9to_stringERKN9LoraCache21TaskLayerModuleConfigE", false]], "tensorrt_llm::runtime::tokenextraidtype (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime16TokenExtraIdTypeE", false]], "tensorrt_llm::runtime::tokenidtype (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime11TokenIdTypeE", false]], "tensorrt_llm::runtime::trtdatatype (c++ struct)": [[1, "_CPPv4I0_bEN12tensorrt_llm7runtime11TRTDataTypeE", false]], "tensorrt_llm::runtime::trtdatatype<bool> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIbEE", false]], "tensorrt_llm::runtime::trtdatatype<bool>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIbE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<float> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIfEE", false]], "tensorrt_llm::runtime::trtdatatype<float>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIfE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<half> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeI4halfEE", false]], "tensorrt_llm::runtime::trtdatatype<half>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeI4halfE5valueE", false]], 
"tensorrt_llm::runtime::trtdatatype<kernels::finishedstate> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIN7kernels13FinishedStateEEE", false]], "tensorrt_llm::runtime::trtdatatype<kernels::finishedstate>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIN7kernels13FinishedStateEE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<kernels::kvcacheindex> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIN7kernels12KVCacheIndexEEE", false]], "tensorrt_llm::runtime::trtdatatype<kernels::kvcacheindex>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIN7kernels12KVCacheIndexEE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<runtime::requesttype> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIN7runtime11RequestTypeEEE", false]], "tensorrt_llm::runtime::trtdatatype<runtime::requesttype>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIN7runtime11RequestTypeEE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<std::int32_t> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt7int32_tEEE", false]], "tensorrt_llm::runtime::trtdatatype<std::int32_t>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt7int32_tEE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<std::int64_t> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt7int64_tEEE", false]], "tensorrt_llm::runtime::trtdatatype<std::int64_t>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt7int64_tEE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<std::int8_t> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt6int8_tEEE", false]], "tensorrt_llm::runtime::trtdatatype<std::int8_t>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt6int8_tEE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<std::uint32_t> (c++ struct)": [[1, 
"_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt8uint32_tEEE", false]], "tensorrt_llm::runtime::trtdatatype<std::uint32_t>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt8uint32_tEE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<std::uint64_t> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt8uint64_tEEE", false]], "tensorrt_llm::runtime::trtdatatype<std::uint64_t>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt8uint64_tEE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<std::uint8_t> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt7uint8_tEEE", false]], "tensorrt_llm::runtime::trtdatatype<std::uint8_t>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt7uint8_tEE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<t*> (c++ struct)": [[1, "_CPPv4I0EN12tensorrt_llm7runtime11TRTDataTypeIP1TEE", false]], "tensorrt_llm::runtime::trtdatatype<t*>::kunderlyingtype (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIP1TE15kUnderlyingTypeE", false]], "tensorrt_llm::runtime::trtdatatype<t*>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIP1TE5valueE", false]], "tensorrt_llm::runtime::trtdatatype<void*> (c++ struct)": [[1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIPvEE", false]], "tensorrt_llm::runtime::trtdatatype<void*>::value (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIPvE5valueE", false]], "tensorrt_llm::runtime::uniquetoken (c++ struct)": [[1, "_CPPv4N12tensorrt_llm7runtime11UniqueTokenE", false]], "tensorrt_llm::runtime::uniquetoken::operator== (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11UniqueTokeneqERK11UniqueToken", false]], "tensorrt_llm::runtime::uniquetoken::tokenextraid (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11UniqueToken12tokenExtraIdE", false]], "tensorrt_llm::runtime::uniquetoken::tokenid (c++ member)": [[1, 
"_CPPv4N12tensorrt_llm7runtime11UniqueToken7tokenIdE", false]], "tensorrt_llm::runtime::vectokenextraids (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime16VecTokenExtraIdsE", false]], "tensorrt_llm::runtime::vecuniquetokens (c++ type)": [[1, "_CPPv4N12tensorrt_llm7runtime15VecUniqueTokensE", false]], "tensorrt_llm::runtime::worldconfig (c++ class)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfigE", false]], "tensorrt_llm::runtime::worldconfig::enableattentiondp (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig17enableAttentionDPEv", false]], "tensorrt_llm::runtime::worldconfig::getcontextparallelgroup (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig23getContextParallelGroupEv", false]], "tensorrt_llm::runtime::worldconfig::getcontextparallelism (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig21getContextParallelismEv", false]], "tensorrt_llm::runtime::worldconfig::getcontextparallelrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig22getContextParallelRankEv", false]], "tensorrt_llm::runtime::worldconfig::getdevice (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig9getDeviceEv", false]], "tensorrt_llm::runtime::worldconfig::getdeviceof (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig11getDeviceOfE10SizeType32", false]], "tensorrt_llm::runtime::worldconfig::getgpuspergroup (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig15getGpusPerGroupEv", false]], "tensorrt_llm::runtime::worldconfig::getgpuspernode (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig14getGpusPerNodeEv", false]], "tensorrt_llm::runtime::worldconfig::getlastrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig11getLastRankEv", false]], "tensorrt_llm::runtime::worldconfig::getlocalrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig12getLocalRankEv", false]], "tensorrt_llm::runtime::worldconfig::getnoderank (c++ function)": 
[[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig11getNodeRankEv", false]], "tensorrt_llm::runtime::worldconfig::getnoderankof (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig13getNodeRankOfE10SizeType32", false]], "tensorrt_llm::runtime::worldconfig::getpipelineparallelgroup (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig24getPipelineParallelGroupEv", false]], "tensorrt_llm::runtime::worldconfig::getpipelineparallelism (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig22getPipelineParallelismEv", false]], "tensorrt_llm::runtime::worldconfig::getpipelineparallelrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig23getPipelineParallelRankEv", false]], "tensorrt_llm::runtime::worldconfig::getrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig7getRankEv", false]], "tensorrt_llm::runtime::worldconfig::getsize (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig7getSizeEv", false]], "tensorrt_llm::runtime::worldconfig::gettensorparallelgroup (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig22getTensorParallelGroupEv", false]], "tensorrt_llm::runtime::worldconfig::gettensorparallelism (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig20getTensorParallelismEv", false]], "tensorrt_llm::runtime::worldconfig::gettensorparallelrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig21getTensorParallelRankEv", false]], "tensorrt_llm::runtime::worldconfig::iscontextparallel (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig17isContextParallelEv", false]], "tensorrt_llm::runtime::worldconfig::isfirstcontextparallelrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig26isFirstContextParallelRankEv", false]], "tensorrt_llm::runtime::worldconfig::isfirstpipelineparallelrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig27isFirstPipelineParallelRankEv", false]], 
"tensorrt_llm::runtime::worldconfig::isfirsttensorparallelrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig25isFirstTensorParallelRankEv", false]], "tensorrt_llm::runtime::worldconfig::islastpipelineparallelrank (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig26isLastPipelineParallelRankEv", false]], "tensorrt_llm::runtime::worldconfig::ispipelineparallel (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig18isPipelineParallelEv", false]], "tensorrt_llm::runtime::worldconfig::istensorparallel (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig16isTensorParallelEv", false]], "tensorrt_llm::runtime::worldconfig::kdefaultgpuspernode (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig19kDefaultGpusPerNodeE", false]], "tensorrt_llm::runtime::worldconfig::mcontextparallelism (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig19mContextParallelismE", false]], "tensorrt_llm::runtime::worldconfig::mdeviceids (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig10mDeviceIdsE", false]], "tensorrt_llm::runtime::worldconfig::menableattentiondp (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig18mEnableAttentionDPE", false]], "tensorrt_llm::runtime::worldconfig::mgpuspernode (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig12mGpusPerNodeE", false]], "tensorrt_llm::runtime::worldconfig::mpi (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig3mpiE10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEEb", false]], "tensorrt_llm::runtime::worldconfig::mpipelineparallelism (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig20mPipelineParallelismE", false]], "tensorrt_llm::runtime::worldconfig::mrank (c++ member)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig5mRankE", false]], "tensorrt_llm::runtime::worldconfig::mtensorparallelism (c++ member)": 
[[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig18mTensorParallelismE", false]], "tensorrt_llm::runtime::worldconfig::validmpiconfig (c++ function)": [[1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig14validMpiConfigEv", false]], "tensorrt_llm::runtime::worldconfig::worldconfig (c++ function)": [[1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig11WorldConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalINSt6vectorI10SizeType32EEEEb", false]], "text (tensorrt_llm.llmapi.completionoutput attribute)": [[70, "tensorrt_llm.llmapi.CompletionOutput.text", false]], "text_diff (tensorrt_llm.llmapi.completionoutput attribute)": [[70, "tensorrt_llm.llmapi.CompletionOutput.text_diff", false]], "text_diff (tensorrt_llm.llmapi.completionoutput property)": [[70, "id4", false]], "timestepembedding (class in tensorrt_llm.layers.embedding)": [[83, "tensorrt_llm.layers.embedding.TimestepEmbedding", false]], "timesteps (class in tensorrt_llm.layers.embedding)": [[83, "tensorrt_llm.layers.embedding.Timesteps", false]], "to_dict() (tensorrt_llm.llmapi.buildconfig method)": [[70, "tensorrt_llm.llmapi.BuildConfig.to_dict", false]], "to_dict() (tensorrt_llm.llmapi.calibconfig method)": [[70, "tensorrt_llm.llmapi.CalibConfig.to_dict", false]], "to_dict() (tensorrt_llm.llmapi.quantconfig method)": [[70, "tensorrt_llm.llmapi.QuantConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.chatglmconfig method)": [[84, "tensorrt_llm.models.ChatGLMConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.cogvlmconfig method)": [[84, "tensorrt_llm.models.CogVLMConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.dbrxconfig method)": [[84, "tensorrt_llm.models.DbrxConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.falconconfig method)": [[84, "tensorrt_llm.models.FalconConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.gemmaconfig method)": [[84, "tensorrt_llm.models.GemmaConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.gptconfig method)": [[84, 
"tensorrt_llm.models.GPTConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.gptjconfig method)": [[84, "tensorrt_llm.models.GPTJConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.llamaconfig method)": [[84, "tensorrt_llm.models.LLaMAConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.medusaconfig method)": [[84, "tensorrt_llm.models.MedusaConfig.to_dict", false]], "to_dict() (tensorrt_llm.models.pretrainedconfig method)": [[84, "tensorrt_llm.models.PretrainedConfig.to_dict", false]], "to_json_file() (tensorrt_llm.models.pretrainedconfig method)": [[84, "tensorrt_llm.models.PretrainedConfig.to_json_file", false]], "to_layer_quant_config() (tensorrt_llm.models.pretrainedconfig method)": [[84, "tensorrt_llm.models.PretrainedConfig.to_layer_quant_config", false]], "to_legacy_setting() (tensorrt_llm.plugin.pluginconfig method)": [[85, "tensorrt_llm.plugin.PluginConfig.to_legacy_setting", false]], "token_drop() (tensorrt_llm.layers.embedding.labelembedding method)": [[83, "tensorrt_llm.layers.embedding.LabelEmbedding.token_drop", false]], "token_end (tensorrt_llm.llmapi.kvcacheretentionconfig.tokenrangeretentionconfig property)": [[70, "tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.token_end", false]], "token_ids (tensorrt_llm.llmapi.completionoutput attribute)": [[70, "tensorrt_llm.llmapi.CompletionOutput.token_ids", false]], "token_ids_diff (tensorrt_llm.llmapi.completionoutput attribute)": [[70, "tensorrt_llm.llmapi.CompletionOutput.token_ids_diff", false]], "token_ids_diff (tensorrt_llm.llmapi.completionoutput property)": [[70, "id5", false]], "token_range_retention_configs (tensorrt_llm.llmapi.kvcacheretentionconfig property)": [[70, "tensorrt_llm.llmapi.KvCacheRetentionConfig.token_range_retention_configs", false]], "token_start (tensorrt_llm.llmapi.kvcacheretentionconfig.tokenrangeretentionconfig property)": [[70, "tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.token_start", false]], "tokenizer 
(tensorrt_llm.llmapi.llm attribute)": [[70, "tensorrt_llm.llmapi.LLM.tokenizer", false]], "tokenizer (tensorrt_llm.llmapi.llm property)": [[70, "id0", false]], "tokenizer_image_token() (tensorrt_llm.runtime.multimodalmodelrunner static method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.tokenizer_image_token", false]], "tokenizer_max_seq_length (tensorrt_llm.llmapi.calibconfig attribute)": [[70, "tensorrt_llm.llmapi.CalibConfig.tokenizer_max_seq_length", false]], "tokens_per_block (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.tokens_per_block", false]], "tokens_per_block (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.tokens_per_block", false]], "top_k (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.top_k", false]], "top_k (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.top_k", false]], "top_p (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.top_p", false]], "top_p (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.top_p", false]], "top_p_decay (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.top_p_decay", false]], "top_p_decay (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.top_p_decay", false]], "top_p_min (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.top_p_min", false]], "top_p_min (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.top_p_min", false]], "top_p_reset_ids (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.top_p_reset_ids", false]], "top_p_reset_ids (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.top_p_reset_ids", false]], 
"topk() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.topk", false]], "torch_compile_enable_userbuffers (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_enable_userbuffers", false]], "torch_compile_enabled (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_enabled", false]], "torch_compile_fullgraph (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_fullgraph", false]], "torch_compile_inductor_enabled (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_inductor_enabled", false]], "torch_compile_piecewise_cuda_graph (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.torch_compile_piecewise_cuda_graph", false]], "torchllmargs (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs", false]], "tp_split_dim() (tensorrt_llm.layers.linear.linear class method)": [[83, "tensorrt_llm.layers.linear.Linear.tp_split_dim", false]], "tp_split_dim() (tensorrt_llm.layers.linear.linearbase class method)": [[83, "tensorrt_llm.layers.linear.LinearBase.tp_split_dim", false]], "tp_split_dim() (tensorrt_llm.layers.linear.rowlinear class method)": [[83, "tensorrt_llm.layers.linear.RowLinear.tp_split_dim", false]], "transfer_mode (tensorrt_llm.llmapi.kvcacheretentionconfig property)": [[70, "tensorrt_llm.llmapi.KvCacheRetentionConfig.transfer_mode", false]], "transpose() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.transpose", false]], "transpose() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.transpose", false]], "trtllm-serve-disaggregated command line option": [[30, "cmdoption-trtllm-serve-disaggregated-c", false], [30, "cmdoption-trtllm-serve-disaggregated-r", false], [30, "cmdoption-trtllm-serve-disaggregated-t", false]], 
"trtllm-serve-disaggregated_mpi_worker command line option": [[30, "cmdoption-trtllm-serve-disaggregated_mpi_worker-c", false], [30, "cmdoption-trtllm-serve-disaggregated_mpi_worker-log_level", false]], "trtllm-serve-serve command line option": [[30, "cmdoption-trtllm-serve-serve-arg-MODEL", false], [30, "cmdoption-trtllm-serve-serve-backend", false], [30, "cmdoption-trtllm-serve-serve-cluster_size", false], [30, "cmdoption-trtllm-serve-serve-ep_size", false], [30, "cmdoption-trtllm-serve-serve-extra_llm_api_options", false], [30, "cmdoption-trtllm-serve-serve-gpus_per_node", false], [30, "cmdoption-trtllm-serve-serve-host", false], [30, "cmdoption-trtllm-serve-serve-kv_cache_free_gpu_memory_fraction", false], [30, "cmdoption-trtllm-serve-serve-log_level", false], [30, "cmdoption-trtllm-serve-serve-max_batch_size", false], [30, "cmdoption-trtllm-serve-serve-max_beam_width", false], [30, "cmdoption-trtllm-serve-serve-max_num_tokens", false], [30, "cmdoption-trtllm-serve-serve-max_seq_len", false], [30, "cmdoption-trtllm-serve-serve-num_postprocess_workers", false], [30, "cmdoption-trtllm-serve-serve-port", false], [30, "cmdoption-trtllm-serve-serve-pp_size", false], [30, "cmdoption-trtllm-serve-serve-reasoning_parser", false], [30, "cmdoption-trtllm-serve-serve-tokenizer", false], [30, "cmdoption-trtllm-serve-serve-tp_size", false], [30, "cmdoption-trtllm-serve-serve-trust_remote_code", false]], "trtllm_modules_to_hf_modules (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.trtllm_modules_to_hf_modules", false]], "trtllmargs (class in tensorrt_llm.llmapi)": [[70, "tensorrt_llm.llmapi.TrtLlmArgs", false]], "truncate_prompt_tokens (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.truncate_prompt_tokens", false]], "twoshot (tensorrt_llm.functional.allreducestrategy attribute)": [[82, "tensorrt_llm.functional.AllReduceStrategy.TWOSHOT", false]], "ub (tensorrt_llm.functional.allreducestrategy 
attribute)": [[82, "tensorrt_llm.functional.AllReduceStrategy.UB", false]], "unary() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.unary", false]], "unbind() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.unbind", false]], "unbind() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.unbind", false]], "unfuse_qkv_projections() (tensorrt_llm.models.sd3transformer2dmodel method)": [[84, "tensorrt_llm.models.SD3Transformer2DModel.unfuse_qkv_projections", false]], "unpatchify() (tensorrt_llm.models.dit method)": [[84, "tensorrt_llm.models.DiT.unpatchify", false]], "unsqueeze() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.unsqueeze", false]], "unsqueeze() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.unsqueeze", false]], "update() (tensorrt_llm.llmapi.buildconfig method)": [[70, "tensorrt_llm.llmapi.BuildConfig.update", false]], "update() (tensorrt_llm.runtime.samplingconfig method)": [[87, "tensorrt_llm.runtime.SamplingConfig.update", false]], "update_from_dict() (tensorrt_llm.llmapi.buildconfig method)": [[70, "tensorrt_llm.llmapi.BuildConfig.update_from_dict", false]], "update_kv_cache_type() (tensorrt_llm.llmapi.buildconfig method)": [[70, "tensorrt_llm.llmapi.BuildConfig.update_kv_cache_type", false]], "update_output_ids_by_offset() (tensorrt_llm.runtime.generationsession method)": [[87, "tensorrt_llm.runtime.GenerationSession.update_output_ids_by_offset", false]], "update_strategy() (tensorrt_llm.functional.allreduceparams method)": [[82, "tensorrt_llm.functional.AllReduceParams.update_strategy", false]], "use_beam_hyps (tensorrt_llm.runtime.samplingconfig attribute)": [[87, "tensorrt_llm.runtime.SamplingConfig.use_beam_hyps", false]], "use_beam_search (tensorrt_llm.llmapi.samplingparams attribute)": [[70, "tensorrt_llm.llmapi.SamplingParams.use_beam_search", false]], "use_cuda_graph (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, 
"tensorrt_llm.llmapi.TorchLlmArgs.use_cuda_graph", false]], "use_dynamic_tree (tensorrt_llm.llmapi.eagledecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.EagleDecodingConfig.use_dynamic_tree", false]], "use_gemm_allreduce_plugin (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.use_gemm_allreduce_plugin", false]], "use_gpt_attention_plugin (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.use_gpt_attention_plugin", false]], "use_kv_cache (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.use_kv_cache", false]], "use_kv_cache (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.use_kv_cache", false]], "use_lora() (tensorrt_llm.models.decodermodel method)": [[84, "tensorrt_llm.models.DecoderModel.use_lora", false]], "use_lora() (tensorrt_llm.models.encodermodel method)": [[84, "tensorrt_llm.models.EncoderModel.use_lora", false]], "use_lora() (tensorrt_llm.models.gemmaforcausallm method)": [[84, "tensorrt_llm.models.GemmaForCausalLM.use_lora", false]], "use_lora() (tensorrt_llm.models.gptforcausallm method)": [[84, "tensorrt_llm.models.GPTForCausalLM.use_lora", false]], "use_lora() (tensorrt_llm.models.llamaforcausallm method)": [[84, "tensorrt_llm.models.LLaMAForCausalLM.use_lora", false]], "use_lora() (tensorrt_llm.models.mllamaforcausallm method)": [[84, "tensorrt_llm.models.MLLaMAForCausalLM.use_lora", false]], "use_lora() (tensorrt_llm.models.phi3forcausallm method)": [[84, "tensorrt_llm.models.Phi3ForCausalLM.use_lora", false]], "use_lora() (tensorrt_llm.models.phiforcausallm method)": [[84, "tensorrt_llm.models.PhiForCausalLM.use_lora", false]], "use_lora_plugin (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.use_lora_plugin", false]], "use_lora_plugin (tensorrt_llm.runtime.modelrunner property)": [[87, 
"tensorrt_llm.runtime.ModelRunner.use_lora_plugin", false]], "use_mamba_conv1d_plugin (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.use_mamba_conv1d_plugin", false]], "use_meta_recipe (tensorrt_llm.llmapi.quantconfig attribute)": [[70, "tensorrt_llm.llmapi.QuantConfig.use_meta_recipe", false]], "use_mrope (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.use_mrope", false]], "use_prompt_tuning() (tensorrt_llm.models.encodermodel method)": [[84, "tensorrt_llm.models.EncoderModel.use_prompt_tuning", false]], "use_refit (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.use_refit", false]], "use_relaxed_acceptance_for_thinking (tensorrt_llm.llmapi.mtpdecodingconfig attribute)": [[70, "tensorrt_llm.llmapi.MTPDecodingConfig.use_relaxed_acceptance_for_thinking", false]], "use_strip_plan (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.use_strip_plan", false]], "validate_cuda_graph_config() (tensorrt_llm.llmapi.torchllmargs method)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.validate_cuda_graph_config", false]], "validate_cuda_graph_max_batch_size() (tensorrt_llm.llmapi.torchllmargs class method)": [[70, "tensorrt_llm.llmapi.TorchLlmArgs.validate_cuda_graph_max_batch_size", false]], "validate_positive_values() (tensorrt_llm.llmapi.lookaheaddecodingconfig class method)": [[70, "tensorrt_llm.llmapi.LookaheadDecodingConfig.validate_positive_values", false]], "verbatim (tensorrt_llm.models.gemmaconfig attribute)": [[84, "tensorrt_llm.models.GemmaConfig.VERBATIM", false]], "video_preprocess() (tensorrt_llm.runtime.multimodalmodelrunner method)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.video_preprocess", false]], "view() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.view", false]], "view() (tensorrt_llm.functional.tensor method)": [[82, "tensorrt_llm.functional.Tensor.view", false]], "view() 
(tensorrt_llm.runtime.tensorinfo method)": [[87, "tensorrt_llm.runtime.TensorInfo.view", false]], "visual_engine_dir (tensorrt_llm.runtime.multimodalmodelrunner property)": [[87, "tensorrt_llm.runtime.MultimodalModelRunner.visual_engine_dir", false]], "visualize_network (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.visualize_network", false]], "vocab_size (tensorrt_llm.runtime.generationsession property)": [[87, "tensorrt_llm.runtime.GenerationSession.vocab_size", false]], "vocab_size (tensorrt_llm.runtime.modelconfig attribute)": [[87, "tensorrt_llm.runtime.ModelConfig.vocab_size", false]], "vocab_size (tensorrt_llm.runtime.modelrunner property)": [[87, "tensorrt_llm.runtime.ModelRunner.vocab_size", false]], "vocab_size (tensorrt_llm.runtime.modelrunnercpp property)": [[87, "tensorrt_llm.runtime.ModelRunnerCpp.vocab_size", false]], "vocab_size_padded (tensorrt_llm.runtime.modelrunner property)": [[87, "tensorrt_llm.runtime.ModelRunner.vocab_size_padded", false]], "vocab_size_padded (tensorrt_llm.runtime.modelrunnercpp property)": [[87, "tensorrt_llm.runtime.ModelRunnerCpp.vocab_size_padded", false]], "w4a16 (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.W4A16", false]], "w4a16_awq (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.W4A16_AWQ", false]], "w4a16_gptq (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.W4A16_GPTQ", false]], "w4a8_awq (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.W4A8_AWQ", false]], "w4a8_qserve_per_channel (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.W4A8_QSERVE_PER_CHANNEL", false]], "w4a8_qserve_per_group (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.W4A8_QSERVE_PER_GROUP", false]], "w8a16 (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.W8A16", false]], "w8a16_gptq 
(tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.W8A16_GPTQ", false]], "w8a8_sq_per_channel (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL", false]], "w8a8_sq_per_channel_per_tensor_plugin (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN", false]], "w8a8_sq_per_channel_per_token_plugin (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN", false]], "w8a8_sq_per_tensor_per_token_plugin (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN", false]], "w8a8_sq_per_tensor_plugin (tensorrt_llm.llmapi.quantalgo attribute)": [[70, "tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN", false]], "weight_loader() (tensorrt_llm.layers.attention.deepseekv2attention method)": [[83, "tensorrt_llm.layers.attention.DeepseekV2Attention.weight_loader", false]], "weight_loader() (tensorrt_llm.layers.embedding.embedding method)": [[83, "tensorrt_llm.layers.embedding.Embedding.weight_loader", false]], "weight_loader() (tensorrt_llm.layers.linear.linearbase method)": [[83, "tensorrt_llm.layers.linear.LinearBase.weight_loader", false]], "weight_sparsity (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.weight_sparsity", false]], "weight_streaming (tensorrt_llm.llmapi.buildconfig attribute)": [[70, "tensorrt_llm.llmapi.BuildConfig.weight_streaming", false]], "where() (in module tensorrt_llm.functional)": [[82, "tensorrt_llm.functional.where", false]], "whisperencoder (class in tensorrt_llm.models)": [[84, "tensorrt_llm.models.WhisperEncoder", false]], "workspace (tensorrt_llm.llmapi.llm attribute)": [[70, "tensorrt_llm.llmapi.LLM.workspace", false]], "workspace (tensorrt_llm.llmapi.llm property)": [[70, "id1", false]], "workspace (tensorrt_llm.llmapi.trtllmargs 
attribute)": [[70, "tensorrt_llm.llmapi.TrtLlmArgs.workspace", false]], "wrapped_property (tensorrt_llm.llmapi.torchllmargs attribute)": [[70, "id11", false], [70, "id14", false], [70, "id17", false], [70, "tensorrt_llm.llmapi.TorchLlmArgs.wrapped_property", false]], "wrapped_property (tensorrt_llm.llmapi.trtllmargs attribute)": [[70, "id20", false], [70, "id23", false], [70, "id26", false], [70, "id29", false], [70, "id32", false], [70, "tensorrt_llm.llmapi.TrtLlmArgs.wrapped_property", false]], "yarn (tensorrt_llm.functional.positionembeddingtype attribute)": [[82, "tensorrt_llm.functional.PositionEmbeddingType.yarn", false]], "yarn (tensorrt_llm.functional.rotaryscalingtype attribute)": [[82, "tensorrt_llm.functional.RotaryScalingType.yarn", false]]}, "objects": {"": [[1, 0, 1, "c.FMT_DIM", "FMT_DIM"], [1, 0, 1, "c.SET_FROM_OPTIONAL", "SET_FROM_OPTIONAL"], [1, 1, 1, "_CPPv48nvinfer1", "nvinfer1"], [0, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [0, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [0, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [0, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [0, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [0, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [0, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [0, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, 
"_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [1, 1, 1, "_CPPv412tensorrt_llm", "tensorrt_llm"], [0, 1, 1, "_CPPv4N12tensorrt_llm13batch_managerE", "tensorrt_llm::batch_manager"], [1, 1, 1, "_CPPv4N12tensorrt_llm13batch_managerE", "tensorrt_llm::batch_manager"], [1, 1, 1, "_CPPv4N12tensorrt_llm13batch_managerE", "tensorrt_llm::batch_manager"], [1, 1, 1, "_CPPv4N12tensorrt_llm13batch_managerE", "tensorrt_llm::batch_manager"], [1, 1, 1, "_CPPv4N12tensorrt_llm13batch_managerE", "tensorrt_llm::batch_manager"], [0, 1, 1, "_CPPv4N12tensorrt_llm13batch_manager16kv_cache_managerE", "tensorrt_llm::batch_manager::kv_cache_manager"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executorE", "tensorrt_llm::executor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executorE", "tensorrt_llm::executor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executorE", "tensorrt_llm::executor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executorE", "tensorrt_llm::executor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executorE", 
"tensorrt_llm::executor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executorE", "tensorrt_llm::executor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executorE", "tensorrt_llm::executor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executorE", "tensorrt_llm::executor"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutputE", "tensorrt_llm::executor::AdditionalModelOutput"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutput21AdditionalModelOutputENSt6stringEb", "tensorrt_llm::executor::AdditionalModelOutput::AdditionalModelOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutput21AdditionalModelOutputENSt6stringEb", "tensorrt_llm::executor::AdditionalModelOutput::AdditionalModelOutput::gatherContext"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutput21AdditionalModelOutputENSt6stringEb", "tensorrt_llm::executor::AdditionalModelOutput::AdditionalModelOutput::name"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutput13gatherContextE", "tensorrt_llm::executor::AdditionalModelOutput::gatherContext"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor21AdditionalModelOutput4nameE", "tensorrt_llm::executor::AdditionalModelOutput::name"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor21AdditionalModelOutputeqERK21AdditionalModelOutput", "tensorrt_llm::executor::AdditionalModelOutput::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor21AdditionalModelOutputeqERK21AdditionalModelOutput", "tensorrt_llm::executor::AdditionalModelOutput::operator==::other"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputE", "tensorrt_llm::executor::AdditionalOutput"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputENSt6stringE6Tensor", "tensorrt_llm::executor::AdditionalOutput::AdditionalOutput"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputERK16AdditionalOutput", "tensorrt_llm::executor::AdditionalOutput::AdditionalOutput"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputERR16AdditionalOutput", "tensorrt_llm::executor::AdditionalOutput::AdditionalOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputENSt6stringE6Tensor", "tensorrt_llm::executor::AdditionalOutput::AdditionalOutput::name"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputERK16AdditionalOutput", "tensorrt_llm::executor::AdditionalOutput::AdditionalOutput::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputERR16AdditionalOutput", "tensorrt_llm::executor::AdditionalOutput::AdditionalOutput::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput16AdditionalOutputENSt6stringE6Tensor", "tensorrt_llm::executor::AdditionalOutput::AdditionalOutput::output"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput4nameE", "tensorrt_llm::executor::AdditionalOutput::name"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputaSERK16AdditionalOutput", "tensorrt_llm::executor::AdditionalOutput::operator="], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputaSERR16AdditionalOutput", "tensorrt_llm::executor::AdditionalOutput::operator="], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputaSERK16AdditionalOutput", "tensorrt_llm::executor::AdditionalOutput::operator=::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputaSERR16AdditionalOutput", "tensorrt_llm::executor::AdditionalOutput::operator=::other"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutput6outputE", "tensorrt_llm::executor::AdditionalOutput::output"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor16AdditionalOutputD0Ev", "tensorrt_llm::executor::AdditionalOutput::~AdditionalOutput"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor12BatchingTypeE", "tensorrt_llm::executor::BatchingType"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12BatchingType9kINFLIGHTE", 
"tensorrt_llm::executor::BatchingType::kINFLIGHT"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12BatchingType7kSTATICE", "tensorrt_llm::executor::BatchingType::kSTATIC"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor10BeamTokensE", "tensorrt_llm::executor::BeamTokens"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor10BufferViewE", "tensorrt_llm::executor::BufferView"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfigE", "tensorrt_llm::executor::CacheTransceiverConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfig22CacheTransceiverConfigENSt8optionalI6size_tEE", "tensorrt_llm::executor::CacheTransceiverConfig::CacheTransceiverConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfig22CacheTransceiverConfigENSt8optionalI6size_tEE", "tensorrt_llm::executor::CacheTransceiverConfig::CacheTransceiverConfig::maxNumTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor22CacheTransceiverConfig15getMaxNumTokensEv", "tensorrt_llm::executor::CacheTransceiverConfig::getMaxNumTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfig13mMaxNumTokensE", "tensorrt_llm::executor::CacheTransceiverConfig::mMaxNumTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor22CacheTransceiverConfigeqERK22CacheTransceiverConfig", "tensorrt_llm::executor::CacheTransceiverConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor22CacheTransceiverConfigeqERK22CacheTransceiverConfig", "tensorrt_llm::executor::CacheTransceiverConfig::operator==::other"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfig15setMaxNumTokensE6size_t", "tensorrt_llm::executor::CacheTransceiverConfig::setMaxNumTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22CacheTransceiverConfig15setMaxNumTokensE6size_t", "tensorrt_llm::executor::CacheTransceiverConfig::setMaxNumTokens::maxNumTokens"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor23CapacitySchedulerPolicyE", "tensorrt_llm::executor::CapacitySchedulerPolicy"], [0, 7, 1, 
"_CPPv4N12tensorrt_llm8executor23CapacitySchedulerPolicy20kGUARANTEED_NO_EVICTE", "tensorrt_llm::executor::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor23CapacitySchedulerPolicy16kMAX_UTILIZATIONE", "tensorrt_llm::executor::CapacitySchedulerPolicy::kMAX_UTILIZATION"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor23CapacitySchedulerPolicy13kSTATIC_BATCHE", "tensorrt_llm::executor::CapacitySchedulerPolicy::kSTATIC_BATCH"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor17CommunicationModeE", "tensorrt_llm::executor::CommunicationMode"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor17CommunicationMode7kLEADERE", "tensorrt_llm::executor::CommunicationMode::kLEADER"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor17CommunicationMode13kORCHESTRATORE", "tensorrt_llm::executor::CommunicationMode::kORCHESTRATOR"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor17CommunicationTypeE", "tensorrt_llm::executor::CommunicationType"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor17CommunicationType4kMPIE", "tensorrt_llm::executor::CommunicationType::kMPI"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor21ContextChunkingPolicyE", "tensorrt_llm::executor::ContextChunkingPolicy"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor21ContextChunkingPolicy15kEQUAL_PROGRESSE", "tensorrt_llm::executor::ContextChunkingPolicy::kEQUAL_PROGRESS"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor21ContextChunkingPolicy24kFIRST_COME_FIRST_SERVEDE", "tensorrt_llm::executor::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParamsE", "tensorrt_llm::executor::ContextPhaseParams"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeNSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypePvNSt8optionalI9VecTokensEE", 
"tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeRKNSt6vectorIcEENSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsERK18ContextPhaseParams", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsERR18ContextPhaseParams", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeNSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::draftTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypePvNSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::draftTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeRKNSt6vectorIcEENSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::draftTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeNSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::firstGenTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypePvNSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::firstGenTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeRKNSt6vectorIcEENSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::firstGenTokens"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeNSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::reqId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypePvNSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::reqId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeRKNSt6vectorIcEENSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::reqId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypeRKNSt6vectorIcEENSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::serializedState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams18ContextPhaseParamsE9VecTokens13RequestIdTypePvNSt8optionalI9VecTokensEE", "tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams::state"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams13RequestIdTypeE", "tensorrt_llm::executor::ContextPhaseParams::RequestIdType"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams8StatePtrE", "tensorrt_llm::executor::ContextPhaseParams::StatePtr"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams7deleterEPKv", "tensorrt_llm::executor::ContextPhaseParams::deleter"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams7deleterEPKv", "tensorrt_llm::executor::ContextPhaseParams::deleter::data"], [0, 3, 1, "_CPPv4NKR12tensorrt_llm8executor18ContextPhaseParams14getDraftTokensEv", "tensorrt_llm::executor::ContextPhaseParams::getDraftTokens"], [0, 3, 1, "_CPPv4NKR12tensorrt_llm8executor18ContextPhaseParams17getFirstGenTokensEv", "tensorrt_llm::executor::ContextPhaseParams::getFirstGenTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18ContextPhaseParams8getReqIdEv", 
"tensorrt_llm::executor::ContextPhaseParams::getReqId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18ContextPhaseParams18getSerializedStateEv", "tensorrt_llm::executor::ContextPhaseParams::getSerializedState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams8getStateEv", "tensorrt_llm::executor::ContextPhaseParams::getState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18ContextPhaseParams8getStateEv", "tensorrt_llm::executor::ContextPhaseParams::getState"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams12mDraftTokensE", "tensorrt_llm::executor::ContextPhaseParams::mDraftTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams15mFirstGenTokensE", "tensorrt_llm::executor::ContextPhaseParams::mFirstGenTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams6mReqIdE", "tensorrt_llm::executor::ContextPhaseParams::mReqId"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams6mStateE", "tensorrt_llm::executor::ContextPhaseParams::mState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParamsaSERK18ContextPhaseParams", "tensorrt_llm::executor::ContextPhaseParams::operator="], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParamsaSERR18ContextPhaseParams", "tensorrt_llm::executor::ContextPhaseParams::operator="], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18ContextPhaseParamseqERK18ContextPhaseParams", "tensorrt_llm::executor::ContextPhaseParams::operator=="], [0, 3, 1, "_CPPv4NO12tensorrt_llm8executor18ContextPhaseParams17popFirstGenTokensEv", "tensorrt_llm::executor::ContextPhaseParams::popFirstGenTokens"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParams12releaseStateEv", "tensorrt_llm::executor::ContextPhaseParams::releaseState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18ContextPhaseParamsD0Ev", "tensorrt_llm::executor::ContextPhaseParams::~ContextPhaseParams"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverStateE", 
"tensorrt_llm::executor::DataTransceiverState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState20DataTransceiverStateEN8kv_cache10CacheStateEN8kv_cache9CommStateE", "tensorrt_llm::executor::DataTransceiverState::DataTransceiverState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState20DataTransceiverStateEv", "tensorrt_llm::executor::DataTransceiverState::DataTransceiverState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState20DataTransceiverStateEN8kv_cache10CacheStateEN8kv_cache9CommStateE", "tensorrt_llm::executor::DataTransceiverState::DataTransceiverState::cacheState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState20DataTransceiverStateEN8kv_cache10CacheStateEN8kv_cache9CommStateE", "tensorrt_llm::executor::DataTransceiverState::DataTransceiverState::commState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20DataTransceiverState13getCacheStateEv", "tensorrt_llm::executor::DataTransceiverState::getCacheState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20DataTransceiverState12getCommStateEv", "tensorrt_llm::executor::DataTransceiverState::getCommState"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState11mCacheStateE", "tensorrt_llm::executor::DataTransceiverState::mCacheState"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState10mCommStateE", "tensorrt_llm::executor::DataTransceiverState::mCommState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20DataTransceiverStateeqERK20DataTransceiverState", "tensorrt_llm::executor::DataTransceiverState::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor20DataTransceiverStateeqERK20DataTransceiverState", "tensorrt_llm::executor::DataTransceiverState::operator==::other"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState13setCacheStateEN8kv_cache10CacheStateE", "tensorrt_llm::executor::DataTransceiverState::setCacheState"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor20DataTransceiverState13setCacheStateEN8kv_cache10CacheStateE", "tensorrt_llm::executor::DataTransceiverState::setCacheState::state"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState12setCommStateEN8kv_cache9CommStateE", "tensorrt_llm::executor::DataTransceiverState::setCommState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20DataTransceiverState12setCommStateEN8kv_cache9CommStateE", "tensorrt_llm::executor::DataTransceiverState::setCommState::state"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20DataTransceiverState8toStringEv", "tensorrt_llm::executor::DataTransceiverState::toString"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor8DataTypeE", "tensorrt_llm::executor::DataType"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType5kBF16E", "tensorrt_llm::executor::DataType::kBF16"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType5kBOOLE", "tensorrt_llm::executor::DataType::kBOOL"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType5kFP16E", "tensorrt_llm::executor::DataType::kFP16"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType5kFP32E", "tensorrt_llm::executor::DataType::kFP32"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType4kFP8E", "tensorrt_llm::executor::DataType::kFP8"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType6kINT32E", "tensorrt_llm::executor::DataType::kINT32"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType6kINT64E", "tensorrt_llm::executor::DataType::kINT64"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType5kINT8E", "tensorrt_llm::executor::DataType::kINT8"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType6kUINT8E", "tensorrt_llm::executor::DataType::kUINT8"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8DataType8kUNKNOWNE", "tensorrt_llm::executor::DataType::kUNKNOWN"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfigE", "tensorrt_llm::executor::DebugConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig11DebugConfigEbb9StringVec10SizeType32", 
"tensorrt_llm::executor::DebugConfig::DebugConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig11DebugConfigEbb9StringVec10SizeType32", "tensorrt_llm::executor::DebugConfig::DebugConfig::debugInputTensors"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig11DebugConfigEbb9StringVec10SizeType32", "tensorrt_llm::executor::DebugConfig::DebugConfig::debugOutputTensors"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig11DebugConfigEbb9StringVec10SizeType32", "tensorrt_llm::executor::DebugConfig::DebugConfig::debugTensorNames"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig11DebugConfigEbb9StringVec10SizeType32", "tensorrt_llm::executor::DebugConfig::DebugConfig::debugTensorsMaxIterations"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig9StringVecE", "tensorrt_llm::executor::DebugConfig::StringVec"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11DebugConfig20getDebugInputTensorsEv", "tensorrt_llm::executor::DebugConfig::getDebugInputTensors"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11DebugConfig21getDebugOutputTensorsEv", "tensorrt_llm::executor::DebugConfig::getDebugOutputTensors"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11DebugConfig19getDebugTensorNamesEv", "tensorrt_llm::executor::DebugConfig::getDebugTensorNames"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11DebugConfig28getDebugTensorsMaxIterationsEv", "tensorrt_llm::executor::DebugConfig::getDebugTensorsMaxIterations"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig18mDebugInputTensorsE", "tensorrt_llm::executor::DebugConfig::mDebugInputTensors"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig19mDebugOutputTensorsE", "tensorrt_llm::executor::DebugConfig::mDebugOutputTensors"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig17mDebugTensorNamesE", "tensorrt_llm::executor::DebugConfig::mDebugTensorNames"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig26mDebugTensorsMaxIterationsE", 
"tensorrt_llm::executor::DebugConfig::mDebugTensorsMaxIterations"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11DebugConfigeqERK11DebugConfig", "tensorrt_llm::executor::DebugConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor11DebugConfigeqERK11DebugConfig", "tensorrt_llm::executor::DebugConfig::operator==::other"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig20setDebugInputTensorsEb", "tensorrt_llm::executor::DebugConfig::setDebugInputTensors"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig20setDebugInputTensorsEb", "tensorrt_llm::executor::DebugConfig::setDebugInputTensors::debugInputTensors"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig21setDebugOutputTensorsEb", "tensorrt_llm::executor::DebugConfig::setDebugOutputTensors"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig21setDebugOutputTensorsEb", "tensorrt_llm::executor::DebugConfig::setDebugOutputTensors::debugOutputTensors"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig19setDebugTensorNamesERK9StringVec", "tensorrt_llm::executor::DebugConfig::setDebugTensorNames"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig19setDebugTensorNamesERK9StringVec", "tensorrt_llm::executor::DebugConfig::setDebugTensorNames::debugTensorNames"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig28setDebugTensorsMaxIterationsE10SizeType32", "tensorrt_llm::executor::DebugConfig::setDebugTensorsMaxIterations"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11DebugConfig28setDebugTensorsMaxIterationsE10SizeType32", "tensorrt_llm::executor::DebugConfig::setDebugTensorsMaxIterations::debugTensorsMaxIterations"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor24DebugTensorsPerIterationE", "tensorrt_llm::executor::DebugTensorsPerIteration"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor24DebugTensorsPerIteration12debugTensorsE", "tensorrt_llm::executor::DebugTensorsPerIteration::debugTensors"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor24DebugTensorsPerIteration4iterE", 
"tensorrt_llm::executor::DebugTensorsPerIteration::iter"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfigE", "tensorrt_llm::executor::DecodingConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14DecodingConfigENSt8optionalI12DecodingModeEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI13MedusaChoicesEENSt8optionalI11EagleConfigEE", "tensorrt_llm::executor::DecodingConfig::DecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14DecodingConfigENSt8optionalI12DecodingModeEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI13MedusaChoicesEENSt8optionalI11EagleConfigEE", "tensorrt_llm::executor::DecodingConfig::DecodingConfig::decodingMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14DecodingConfigENSt8optionalI12DecodingModeEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI13MedusaChoicesEENSt8optionalI11EagleConfigEE", "tensorrt_llm::executor::DecodingConfig::DecodingConfig::eagleConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14DecodingConfigENSt8optionalI12DecodingModeEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI13MedusaChoicesEENSt8optionalI11EagleConfigEE", "tensorrt_llm::executor::DecodingConfig::DecodingConfig::lookaheadDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14DecodingConfigENSt8optionalI12DecodingModeEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI13MedusaChoicesEENSt8optionalI11EagleConfigEE", "tensorrt_llm::executor::DecodingConfig::DecodingConfig::medusaChoices"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig31enableSeamlessLookaheadDecodingEv", "tensorrt_llm::executor::DecodingConfig::enableSeamlessLookaheadDecoding"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig15getDecodingModeEv", "tensorrt_llm::executor::DecodingConfig::getDecodingMode"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig14getEagleConfigEv", "tensorrt_llm::executor::DecodingConfig::getEagleConfig"], [0, 
3, 1, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig26getLookaheadDecodingConfigEv", "tensorrt_llm::executor::DecodingConfig::getLookaheadDecodingConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig33getLookaheadDecodingMaxNumRequestEv", "tensorrt_llm::executor::DecodingConfig::getLookaheadDecodingMaxNumRequest"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14DecodingConfig16getMedusaChoicesEv", "tensorrt_llm::executor::DecodingConfig::getMedusaChoices"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig13mDecodingModeE", "tensorrt_llm::executor::DecodingConfig::mDecodingMode"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig12mEagleConfigE", "tensorrt_llm::executor::DecodingConfig::mEagleConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig24mLookaheadDecodingConfigE", "tensorrt_llm::executor::DecodingConfig::mLookaheadDecodingConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig31mLookaheadDecodingMaxNumRequestE", "tensorrt_llm::executor::DecodingConfig::mLookaheadDecodingMaxNumRequest"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14mMedusaChoicesE", "tensorrt_llm::executor::DecodingConfig::mMedusaChoices"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14DecodingConfigeqERK14DecodingConfig", "tensorrt_llm::executor::DecodingConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor14DecodingConfigeqERK14DecodingConfig", "tensorrt_llm::executor::DecodingConfig::operator==::other"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig15setDecodingModeERK12DecodingMode", "tensorrt_llm::executor::DecodingConfig::setDecodingMode"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig14setEagleConfigERK11EagleConfig", "tensorrt_llm::executor::DecodingConfig::setEagleConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig26setLookaheadDecodingConfigERK23LookaheadDecodingConfig", "tensorrt_llm::executor::DecodingConfig::setLookaheadDecodingConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14DecodingConfig26setLookaheadDecodingConfigERK23LookaheadDecodingConfig", "tensorrt_llm::executor::DecodingConfig::setLookaheadDecodingConfig::lookaheadDecodingConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14DecodingConfig16setMedusaChoicesERK13MedusaChoices", "tensorrt_llm::executor::DecodingConfig::setMedusaChoices"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor12DecodingModeE", "tensorrt_llm::executor::DecodingMode"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode4AutoEv", "tensorrt_llm::executor::DecodingMode::Auto"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode10BeamSearchEv", "tensorrt_llm::executor::DecodingMode::BeamSearch"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode12DecodingModeE14UnderlyingType", "tensorrt_llm::executor::DecodingMode::DecodingMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode12DecodingModeE14UnderlyingType", "tensorrt_llm::executor::DecodingMode::DecodingMode::state"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode5EagleEv", "tensorrt_llm::executor::DecodingMode::Eagle"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode19ExplicitDraftTokensEv", "tensorrt_llm::executor::DecodingMode::ExplicitDraftTokens"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode19ExternalDraftTokensEv", "tensorrt_llm::executor::DecodingMode::ExternalDraftTokens"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode9LookaheadEv", "tensorrt_llm::executor::DecodingMode::Lookahead"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode6MedusaEv", "tensorrt_llm::executor::DecodingMode::Medusa"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode4TopKEv", "tensorrt_llm::executor::DecodingMode::TopK"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode8TopKTopPEv", "tensorrt_llm::executor::DecodingMode::TopKTopP"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode4TopPEv", "tensorrt_llm::executor::DecodingMode::TopP"], [0, 1, 1, 
"_CPPv4N12tensorrt_llm8executor12DecodingMode14UnderlyingTypeE", "tensorrt_llm::executor::DecodingMode::UnderlyingType"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode9allBitSetE14UnderlyingType", "tensorrt_llm::executor::DecodingMode::allBitSet"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode9allBitSetE14UnderlyingType", "tensorrt_llm::executor::DecodingMode::allBitSet::bits"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode9anyBitSetE14UnderlyingType", "tensorrt_llm::executor::DecodingMode::anyBitSet"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode9anyBitSetE14UnderlyingType", "tensorrt_llm::executor::DecodingMode::anyBitSet::bits"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode7getNameEv", "tensorrt_llm::executor::DecodingMode::getName"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode8getStateEv", "tensorrt_llm::executor::DecodingMode::getState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode6isAutoEv", "tensorrt_llm::executor::DecodingMode::isAuto"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode12isBeamSearchEv", "tensorrt_llm::executor::DecodingMode::isBeamSearch"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode7isEagleEv", "tensorrt_llm::executor::DecodingMode::isEagle"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode21isExplicitDraftTokensEv", "tensorrt_llm::executor::DecodingMode::isExplicitDraftTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode21isExternalDraftTokensEv", "tensorrt_llm::executor::DecodingMode::isExternalDraftTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode11isLookaheadEv", "tensorrt_llm::executor::DecodingMode::isLookahead"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode8isMedusaEv", "tensorrt_llm::executor::DecodingMode::isMedusa"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode6isTopKEv", "tensorrt_llm::executor::DecodingMode::isTopK"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor12DecodingMode13isTopKandTopPEv", "tensorrt_llm::executor::DecodingMode::isTopKandTopP"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode12isTopKorTopPEv", "tensorrt_llm::executor::DecodingMode::isTopKorTopP"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode6isTopPEv", "tensorrt_llm::executor::DecodingMode::isTopP"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode14isUseBanTokensEv", "tensorrt_llm::executor::DecodingMode::isUseBanTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode13isUseBanWordsEv", "tensorrt_llm::executor::DecodingMode::isUseBanWords"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode20isUseExplicitEosStopEv", "tensorrt_llm::executor::DecodingMode::isUseExplicitEosStop"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode21isUseFrequencyPenaltyEv", "tensorrt_llm::executor::DecodingMode::isUseFrequencyPenalty"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode18isUseMaxLengthStopEv", "tensorrt_llm::executor::DecodingMode::isUseMaxLengthStop"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode14isUseMinLengthEv", "tensorrt_llm::executor::DecodingMode::isUseMinLength"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode9isUseMinPEv", "tensorrt_llm::executor::DecodingMode::isUseMinP"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode22isUseNoRepeatNgramSizeEv", "tensorrt_llm::executor::DecodingMode::isUseNoRepeatNgramSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode22isUseOccurrencePenaltyEv", "tensorrt_llm::executor::DecodingMode::isUseOccurrencePenalty"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode12isUsePenaltyEv", "tensorrt_llm::executor::DecodingMode::isUsePenalty"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode20isUsePresencePenaltyEv", "tensorrt_llm::executor::DecodingMode::isUsePresencePenalty"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode22isUseRepetitionPenaltyEv", 
"tensorrt_llm::executor::DecodingMode::isUseRepetitionPenalty"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode17isUseStopCriteriaEv", "tensorrt_llm::executor::DecodingMode::isUseStopCriteria"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode14isUseStopWordsEv", "tensorrt_llm::executor::DecodingMode::isUseStopWords"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode16isUseTemperatureEv", "tensorrt_llm::executor::DecodingMode::isUseTemperature"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingMode28isUseVariableBeamWidthSearchEv", "tensorrt_llm::executor::DecodingMode::isUseVariableBeamWidthSearch"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode5kAutoE", "tensorrt_llm::executor::DecodingMode::kAuto"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode11kBeamSearchE", "tensorrt_llm::executor::DecodingMode::kBeamSearch"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode6kEagleE", "tensorrt_llm::executor::DecodingMode::kEagle"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode20kExplicitDraftTokensE", "tensorrt_llm::executor::DecodingMode::kExplicitDraftTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode20kExternalDraftTokensE", "tensorrt_llm::executor::DecodingMode::kExternalDraftTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode10kLookaheadE", "tensorrt_llm::executor::DecodingMode::kLookahead"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode7kMedusaE", "tensorrt_llm::executor::DecodingMode::kMedusa"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode9kNumFlagsE", "tensorrt_llm::executor::DecodingMode::kNumFlags"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode5kTopKE", "tensorrt_llm::executor::DecodingMode::kTopK"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode9kTopKTopPE", "tensorrt_llm::executor::DecodingMode::kTopKTopP"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode5kTopPE", "tensorrt_llm::executor::DecodingMode::kTopP"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor12DecodingMode13kUseBanTokensE", "tensorrt_llm::executor::DecodingMode::kUseBanTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode12kUseBanWordsE", "tensorrt_llm::executor::DecodingMode::kUseBanWords"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode19kUseExplicitEosStopE", "tensorrt_llm::executor::DecodingMode::kUseExplicitEosStop"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode22kUseFrequencyPenaltiesE", "tensorrt_llm::executor::DecodingMode::kUseFrequencyPenalties"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode17kUseMaxLengthStopE", "tensorrt_llm::executor::DecodingMode::kUseMaxLengthStop"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode13kUseMinLengthE", "tensorrt_llm::executor::DecodingMode::kUseMinLength"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode8kUseMinPE", "tensorrt_llm::executor::DecodingMode::kUseMinP"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode21kUseNoRepeatNgramSizeE", "tensorrt_llm::executor::DecodingMode::kUseNoRepeatNgramSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode23kUseOccurrencePenaltiesE", "tensorrt_llm::executor::DecodingMode::kUseOccurrencePenalties"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode13kUsePenaltiesE", "tensorrt_llm::executor::DecodingMode::kUsePenalties"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode21kUsePresencePenaltiesE", "tensorrt_llm::executor::DecodingMode::kUsePresencePenalties"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode23kUseRepetitionPenaltiesE", "tensorrt_llm::executor::DecodingMode::kUseRepetitionPenalties"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode24kUseStandardStopCriteriaE", "tensorrt_llm::executor::DecodingMode::kUseStandardStopCriteria"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode13kUseStopWordsE", "tensorrt_llm::executor::DecodingMode::kUseStopWords"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor12DecodingMode15kUseTemperatureE", "tensorrt_llm::executor::DecodingMode::kUseTemperature"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode27kUseVariableBeamWidthSearchE", "tensorrt_llm::executor::DecodingMode::kUseVariableBeamWidthSearch"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode6mStateE", "tensorrt_llm::executor::DecodingMode::mState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingModeeqERK12DecodingMode", "tensorrt_llm::executor::DecodingMode::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor12DecodingModeeqERK12DecodingMode", "tensorrt_llm::executor::DecodingMode::operator==::other"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode8setBitToE14UnderlyingTypeb", "tensorrt_llm::executor::DecodingMode::setBitTo"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode8setBitToE14UnderlyingTypeb", "tensorrt_llm::executor::DecodingMode::setBitTo::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode8setBitToE14UnderlyingTypeb", "tensorrt_llm::executor::DecodingMode::setBitTo::x"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode12useBanTokensEb", "tensorrt_llm::executor::DecodingMode::useBanTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode12useBanTokensEb", "tensorrt_llm::executor::DecodingMode::useBanTokens::banTokens"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode11useBanWordsEb", "tensorrt_llm::executor::DecodingMode::useBanWords"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode11useBanWordsEb", "tensorrt_llm::executor::DecodingMode::useBanWords::banWords"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode18useExplicitEosStopEb", "tensorrt_llm::executor::DecodingMode::useExplicitEosStop"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode18useExplicitEosStopEb", "tensorrt_llm::executor::DecodingMode::useExplicitEosStop::explicitEosStop"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode19useFrequencyPenaltyEb", 
"tensorrt_llm::executor::DecodingMode::useFrequencyPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode19useFrequencyPenaltyEb", "tensorrt_llm::executor::DecodingMode::useFrequencyPenalty::usePenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode16useMaxLengthStopEb", "tensorrt_llm::executor::DecodingMode::useMaxLengthStop"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode16useMaxLengthStopEb", "tensorrt_llm::executor::DecodingMode::useMaxLengthStop::maxLengthStop"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode12useMinLengthEb", "tensorrt_llm::executor::DecodingMode::useMinLength"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode12useMinLengthEb", "tensorrt_llm::executor::DecodingMode::useMinLength::useMinLen"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode7useMinPEb", "tensorrt_llm::executor::DecodingMode::useMinP"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode7useMinPEb", "tensorrt_llm::executor::DecodingMode::useMinP::useMinP"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode20useNoRepeatNgramSizeEb", "tensorrt_llm::executor::DecodingMode::useNoRepeatNgramSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode20useNoRepeatNgramSizeEb", "tensorrt_llm::executor::DecodingMode::useNoRepeatNgramSize::noRepeatNgramSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode22useOccurrencePenaltiesEb", "tensorrt_llm::executor::DecodingMode::useOccurrencePenalties"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode22useOccurrencePenaltiesEb", "tensorrt_llm::executor::DecodingMode::useOccurrencePenalties::usePenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode18usePresencePenaltyEb", "tensorrt_llm::executor::DecodingMode::usePresencePenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode18usePresencePenaltyEb", "tensorrt_llm::executor::DecodingMode::usePresencePenalty::usePenalty"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor12DecodingMode20useRepetitionPenaltyEb", "tensorrt_llm::executor::DecodingMode::useRepetitionPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode20useRepetitionPenaltyEb", "tensorrt_llm::executor::DecodingMode::useRepetitionPenalty::usePenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode12useStopWordsEb", "tensorrt_llm::executor::DecodingMode::useStopWords"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode12useStopWordsEb", "tensorrt_llm::executor::DecodingMode::useStopWords::stopWords"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode14useTemperatureEb", "tensorrt_llm::executor::DecodingMode::useTemperature"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode14useTemperatureEb", "tensorrt_llm::executor::DecodingMode::useTemperature::useTemp"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode26useVariableBeamWidthSearchEb", "tensorrt_llm::executor::DecodingMode::useVariableBeamWidthSearch"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12DecodingMode26useVariableBeamWidthSearchEb", "tensorrt_llm::executor::DecodingMode::useVariableBeamWidthSearch::useVariableBeamWidthSearch"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor22DisServingRequestStatsE", "tensorrt_llm::executor::DisServingRequestStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22DisServingRequestStats11kvCacheSizeE", "tensorrt_llm::executor::DisServingRequestStats::kvCacheSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22DisServingRequestStats17kvCacheTransferMSE", "tensorrt_llm::executor::DisServingRequestStats::kvCacheTransferMS"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfigE", "tensorrt_llm::executor::DynamicBatchConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig18DynamicBatchConfigEbb10SizeType32NSt6vectorINSt4pairI10SizeType3210SizeType32EEEE", "tensorrt_llm::executor::DynamicBatchConfig::DynamicBatchConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig18DynamicBatchConfigEbb10SizeType32NSt6vectorINSt4pairI10SizeType3210SizeType32EEEE", "tensorrt_llm::executor::DynamicBatchConfig::DynamicBatchConfig::batchSizeTable"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig18DynamicBatchConfigEbb10SizeType32NSt6vectorINSt4pairI10SizeType3210SizeType32EEEE", "tensorrt_llm::executor::DynamicBatchConfig::DynamicBatchConfig::dynamicBatchMovingAverageWindow"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig18DynamicBatchConfigEbb10SizeType32NSt6vectorINSt4pairI10SizeType3210SizeType32EEEE", "tensorrt_llm::executor::DynamicBatchConfig::DynamicBatchConfig::enableBatchSizeTuning"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig18DynamicBatchConfigEbb10SizeType32NSt6vectorINSt4pairI10SizeType3210SizeType32EEEE", "tensorrt_llm::executor::DynamicBatchConfig::DynamicBatchConfig::enableMaxNumTokensTuning"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18DynamicBatchConfig17getBatchSizeTableEv", "tensorrt_llm::executor::DynamicBatchConfig::getBatchSizeTable"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18DynamicBatchConfig34getDynamicBatchMovingAverageWindowEv", "tensorrt_llm::executor::DynamicBatchConfig::getDynamicBatchMovingAverageWindow"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18DynamicBatchConfig24getEnableBatchSizeTuningEv", "tensorrt_llm::executor::DynamicBatchConfig::getEnableBatchSizeTuning"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18DynamicBatchConfig27getEnableMaxNumTokensTuningEv", "tensorrt_llm::executor::DynamicBatchConfig::getEnableMaxNumTokensTuning"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig22kDefaultBatchSizeTableE", "tensorrt_llm::executor::DynamicBatchConfig::kDefaultBatchSizeTable"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig39kDefaultDynamicBatchMovingAverageWindowE", "tensorrt_llm::executor::DynamicBatchConfig::kDefaultDynamicBatchMovingAverageWindow"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig15mBatchSizeTableE", "tensorrt_llm::executor::DynamicBatchConfig::mBatchSizeTable"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig32mDynamicBatchMovingAverageWindowE", "tensorrt_llm::executor::DynamicBatchConfig::mDynamicBatchMovingAverageWindow"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig22mEnableBatchSizeTuningE", "tensorrt_llm::executor::DynamicBatchConfig::mEnableBatchSizeTuning"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18DynamicBatchConfig25mEnableMaxNumTokensTuningE", "tensorrt_llm::executor::DynamicBatchConfig::mEnableMaxNumTokensTuning"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor12EagleChoicesE", "tensorrt_llm::executor::EagleChoices"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfigE", "tensorrt_llm::executor::EagleConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig11EagleConfigENSt8optionalI12EagleChoicesEEbNSt8optionalIfEEbNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::EagleConfig::EagleConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig11EagleConfigENSt8optionalI12EagleChoicesEEbNSt8optionalIfEEbNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::EagleConfig::EagleConfig::dynamicTreeMaxTopK"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig11EagleConfigENSt8optionalI12EagleChoicesEEbNSt8optionalIfEEbNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::EagleConfig::EagleConfig::eagleChoices"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig11EagleConfigENSt8optionalI12EagleChoicesEEbNSt8optionalIfEEbNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::EagleConfig::EagleConfig::greedySampling"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig11EagleConfigENSt8optionalI12EagleChoicesEEbNSt8optionalIfEEbNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::EagleConfig::EagleConfig::posteriorThreshold"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor11EagleConfig11EagleConfigENSt8optionalI12EagleChoicesEEbNSt8optionalIfEEbNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::EagleConfig::EagleConfig::useDynamicTree"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig19checkPosteriorValueERKNSt8optionalIfEE", "tensorrt_llm::executor::EagleConfig::checkPosteriorValue"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig19checkPosteriorValueERKNSt8optionalIfEE", "tensorrt_llm::executor::EagleConfig::checkPosteriorValue::value"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11EagleConfig21getDynamicTreeMaxTopKEv", "tensorrt_llm::executor::EagleConfig::getDynamicTreeMaxTopK"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11EagleConfig15getEagleChoicesEv", "tensorrt_llm::executor::EagleConfig::getEagleChoices"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11EagleConfig21getPosteriorThresholdEv", "tensorrt_llm::executor::EagleConfig::getPosteriorThreshold"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11EagleConfig16isGreedySamplingEv", "tensorrt_llm::executor::EagleConfig::isGreedySampling"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig19mDynamicTreeMaxTopKE", "tensorrt_llm::executor::EagleConfig::mDynamicTreeMaxTopK"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig13mEagleChoicesE", "tensorrt_llm::executor::EagleConfig::mEagleChoices"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig15mGreedySamplingE", "tensorrt_llm::executor::EagleConfig::mGreedySampling"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig19mPosteriorThresholdE", "tensorrt_llm::executor::EagleConfig::mPosteriorThreshold"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11EagleConfig15mUseDynamicTreeE", "tensorrt_llm::executor::EagleConfig::mUseDynamicTree"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11EagleConfigeqERK11EagleConfig", "tensorrt_llm::executor::EagleConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor11EagleConfigeqERK11EagleConfig", 
"tensorrt_llm::executor::EagleConfig::operator==::other"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11EagleConfig14useDynamicTreeEv", "tensorrt_llm::executor::EagleConfig::useDynamicTree"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8ExecutorE", "tensorrt_llm::executor::Executor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorENSt10shared_ptrI5ModelEENSt10shared_ptrI5ModelEERK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorENSt10shared_ptrI5ModelEERK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfigRKNSt8optionalINSt3mapINSt6stringE6TensorEEEE", "tensorrt_llm::executor::Executor::Executor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK8Executor", "tensorrt_llm::executor::Executor::Executor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERR8Executor", "tensorrt_llm::executor::Executor::Executor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::decoderEngineBuffer"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfig", 
"tensorrt_llm::executor::Executor::Executor::decoderJsonConfigStr"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorENSt10shared_ptrI5ModelEENSt10shared_ptrI5ModelEERK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::decoderModel"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::decoderModelPath"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::encoderEngineBuffer"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::encoderJsonConfigStr"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorENSt10shared_ptrI5ModelEENSt10shared_ptrI5ModelEERK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::encoderModel"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::encoderModelPath"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfigRKNSt8optionalINSt3mapINSt6stringE6TensorEEEE", "tensorrt_llm::executor::Executor::Executor::engineBuffer"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK8Executor", "tensorrt_llm::executor::Executor::Executor::executor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorENSt10shared_ptrI5ModelEENSt10shared_ptrI5ModelEERK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::executorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorENSt10shared_ptrI5ModelEERK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::executorConfig"], [0, 
4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfigRKNSt8optionalINSt3mapINSt6stringE6TensorEEEE", "tensorrt_llm::executor::Executor::Executor::executorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::executorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::executorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::executorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfigRKNSt8optionalINSt3mapINSt6stringE6TensorEEEE", "tensorrt_llm::executor::Executor::Executor::jsonConfigStr"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfigRKNSt8optionalINSt3mapINSt6stringE6TensorEEEE", "tensorrt_llm::executor::Executor::Executor::managedWeights"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorENSt10shared_ptrI5ModelEERK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::model"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::modelPath"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfigRKNSt8optionalINSt3mapINSt6stringE6TensorEEEE", "tensorrt_llm::executor::Executor::Executor::modelType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERK10BufferViewRKNSt6stringERK10BufferViewRKNSt6stringE9ModelTypeRK14ExecutorConfig", 
"tensorrt_llm::executor::Executor::Executor::modelType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::modelType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor8ExecutorERKNSt10filesystem4pathERKNSt10filesystem4pathE9ModelTypeRK14ExecutorConfig", "tensorrt_llm::executor::Executor::Executor::modelType"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERK6IdTypeRKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::Executor::awaitResponses"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERKNSt6vectorI6IdTypeEERKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::Executor::awaitResponses"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::Executor::awaitResponses"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERK6IdTypeRKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::Executor::awaitResponses::requestId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERKNSt6vectorI6IdTypeEERKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::Executor::awaitResponses::requestIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERK6IdTypeRKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::Executor::awaitResponses::timeout"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERKNSt6vectorI6IdTypeEERKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::Executor::awaitResponses::timeout"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor14awaitResponsesERKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::Executor::awaitResponses::timeout"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8Executor18canEnqueueRequestsEv", 
"tensorrt_llm::executor::Executor::canEnqueueRequests"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor13cancelRequestE6IdType", "tensorrt_llm::executor::Executor::cancelRequest"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor13cancelRequestE6IdType", "tensorrt_llm::executor::Executor::cancelRequest::requestId"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor14enqueueRequestERK7Request", "tensorrt_llm::executor::Executor::enqueueRequest"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor14enqueueRequestERK7Request", "tensorrt_llm::executor::Executor::enqueueRequest::request"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor15enqueueRequestsERKNSt6vectorI7RequestEE", "tensorrt_llm::executor::Executor::enqueueRequests"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Executor15enqueueRequestsERKNSt6vectorI7RequestEE", "tensorrt_llm::executor::Executor::enqueueRequests::requests"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8Executor22getKVCacheEventManagerEv", "tensorrt_llm::executor::Executor::getKVCacheEventManager"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor21getLatestDebugTensorsEv", "tensorrt_llm::executor::Executor::getLatestDebugTensors"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor23getLatestIterationStatsEv", "tensorrt_llm::executor::Executor::getLatestIterationStats"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor21getLatestRequestStatsEv", "tensorrt_llm::executor::Executor::getLatestRequestStats"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8Executor20getNumResponsesReadyERKNSt8optionalI6IdTypeEE", "tensorrt_llm::executor::Executor::getNumResponsesReady"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8Executor20getNumResponsesReadyERKNSt8optionalI6IdTypeEE", "tensorrt_llm::executor::Executor::getNumResponsesReady::requestId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8Executor13isParticipantEv", "tensorrt_llm::executor::Executor::isParticipant"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8Executor5mImplE", 
"tensorrt_llm::executor::Executor::mImpl"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8ExecutoraSERK8Executor", "tensorrt_llm::executor::Executor::operator="], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8ExecutoraSERR8Executor", "tensorrt_llm::executor::Executor::operator="], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8ExecutoraSERK8Executor", "tensorrt_llm::executor::Executor::operator=::executor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Executor8shutdownEv", "tensorrt_llm::executor::Executor::shutdown"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8ExecutorD0Ev", "tensorrt_llm::executor::Executor::~Executor"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfigE", "tensorrt_llm::executor::ExecutorConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::additionalModelOutputs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::batchingType"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::cacheTransceiverConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::debugConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::decodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::enableChunkedContext"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::enableTrtOverlap"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::extendedRuntimePerfKnobConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::gatherGenerationLogits"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::gpuWeightsPercent"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::guidedDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::iterStatsMaxIterations"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::kvCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::logitsPostProcessorConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::maxBatchSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::maxBeamWidth"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::maxNumTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::maxQueueSize"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::maxSeqIdleMicroseconds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::normalizeLogProbs"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::parallelConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::peftCacheConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::promptTableOffloading"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::recvPollPeriodMs"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::requestStatsMaxIterations"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::schedulerConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::specDecConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14ExecutorConfigE10SizeType3215SchedulerConfig13KvCacheConfigbb10SizeType3210SizeType3212BatchingTypeNSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI14ParallelConfigEERKNSt8optionalI15PeftCacheConfigEENSt8optionalI25LogitsPostProcessorConfigEENSt8optionalI14DecodingConfigEEbfNSt8optionalI10SizeType32EERK29ExtendedRuntimePerfKnobConfigNSt8optionalI11DebugConfigEE10SizeType328uint64_tNSt8optionalI25SpeculativeDecodingConfigEENSt8optionalI20GuidedDecodingConfigEENSt8optionalINSt6vectorI21AdditionalModelOutputEEEENSt8optionalI22CacheTransceiverConfigEEbbb", "tensorrt_llm::executor::ExecutorConfig::ExecutorConfig::useGpuDirectStorage"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getAdditionalModelOutputsEv", "tensorrt_llm::executor::ExecutorConfig::getAdditionalModelOutputs"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getBatchingTypeEv", "tensorrt_llm::executor::ExecutorConfig::getBatchingType"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getCacheTransceiverConfigEv", "tensorrt_llm::executor::ExecutorConfig::getCacheTransceiverConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig14getDebugConfigEv", 
"tensorrt_llm::executor::ExecutorConfig::getDebugConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig17getDecodingConfigEv", "tensorrt_llm::executor::ExecutorConfig::getDecodingConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig23getEnableChunkedContextEv", "tensorrt_llm::executor::ExecutorConfig::getEnableChunkedContext"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig19getEnableTrtOverlapEv", "tensorrt_llm::executor::ExecutorConfig::getEnableTrtOverlap"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig32getExtendedRuntimePerfKnobConfigEv", "tensorrt_llm::executor::ExecutorConfig::getExtendedRuntimePerfKnobConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getGatherGenerationLogitsEv", "tensorrt_llm::executor::ExecutorConfig::getGatherGenerationLogits"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig20getGpuWeightsPercentEv", "tensorrt_llm::executor::ExecutorConfig::getGpuWeightsPercent"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig23getGuidedDecodingConfigEv", "tensorrt_llm::executor::ExecutorConfig::getGuidedDecodingConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getIterStatsMaxIterationsEv", "tensorrt_llm::executor::ExecutorConfig::getIterStatsMaxIterations"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig16getKvCacheConfigEv", "tensorrt_llm::executor::ExecutorConfig::getKvCacheConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig19getKvCacheConfigRefEv", "tensorrt_llm::executor::ExecutorConfig::getKvCacheConfigRef"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig28getLogitsPostProcessorConfigEv", "tensorrt_llm::executor::ExecutorConfig::getLogitsPostProcessorConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getMaxBatchSizeEv", "tensorrt_llm::executor::ExecutorConfig::getMaxBatchSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getMaxBeamWidthEv", 
"tensorrt_llm::executor::ExecutorConfig::getMaxBeamWidth"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getMaxNumTokensEv", "tensorrt_llm::executor::ExecutorConfig::getMaxNumTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig15getMaxQueueSizeEv", "tensorrt_llm::executor::ExecutorConfig::getMaxQueueSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig25getMaxSeqIdleMicrosecondsEv", "tensorrt_llm::executor::ExecutorConfig::getMaxSeqIdleMicroseconds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig20getNormalizeLogProbsEv", "tensorrt_llm::executor::ExecutorConfig::getNormalizeLogProbs"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig17getParallelConfigEv", "tensorrt_llm::executor::ExecutorConfig::getParallelConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig18getPeftCacheConfigEv", "tensorrt_llm::executor::ExecutorConfig::getPeftCacheConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig24getPromptTableOffloadingEv", "tensorrt_llm::executor::ExecutorConfig::getPromptTableOffloading"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig19getRecvPollPeriodMsEv", "tensorrt_llm::executor::ExecutorConfig::getRecvPollPeriodMs"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig28getRequestStatsMaxIterationsEv", "tensorrt_llm::executor::ExecutorConfig::getRequestStatsMaxIterations"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig18getSchedulerConfigEv", "tensorrt_llm::executor::ExecutorConfig::getSchedulerConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig21getSchedulerConfigRefEv", "tensorrt_llm::executor::ExecutorConfig::getSchedulerConfigRef"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig16getSpecDecConfigEv", "tensorrt_llm::executor::ExecutorConfig::getSpecDecConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ExecutorConfig22getUseGpuDirectStorageEv", 
"tensorrt_llm::executor::ExecutorConfig::getUseGpuDirectStorage"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig30kDefaultIterStatsMaxIterationsE", "tensorrt_llm::executor::ExecutorConfig::kDefaultIterStatsMaxIterations"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig30kDefaultMaxSeqIdleMicrosecondsE", "tensorrt_llm::executor::ExecutorConfig::kDefaultMaxSeqIdleMicroseconds"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig33kDefaultRequestStatsMaxIterationsE", "tensorrt_llm::executor::ExecutorConfig::kDefaultRequestStatsMaxIterations"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mAdditionalModelOutputsE", "tensorrt_llm::executor::ExecutorConfig::mAdditionalModelOutputs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mBatchingTypeE", "tensorrt_llm::executor::ExecutorConfig::mBatchingType"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mCacheTransceiverConfigE", "tensorrt_llm::executor::ExecutorConfig::mCacheTransceiverConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig12mDebugConfigE", "tensorrt_llm::executor::ExecutorConfig::mDebugConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15mDecodingConfigE", "tensorrt_llm::executor::ExecutorConfig::mDecodingConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig21mEnableChunkedContextE", "tensorrt_llm::executor::ExecutorConfig::mEnableChunkedContext"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig17mEnableTrtOverlapE", "tensorrt_llm::executor::ExecutorConfig::mEnableTrtOverlap"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig30mExtendedRuntimePerfKnobConfigE", "tensorrt_llm::executor::ExecutorConfig::mExtendedRuntimePerfKnobConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mGatherGenerationLogitsE", "tensorrt_llm::executor::ExecutorConfig::mGatherGenerationLogits"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18mGpuWeightsPercentE", 
"tensorrt_llm::executor::ExecutorConfig::mGpuWeightsPercent"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig21mGuidedDecodingConfigE", "tensorrt_llm::executor::ExecutorConfig::mGuidedDecodingConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mIterStatsMaxIterationsE", "tensorrt_llm::executor::ExecutorConfig::mIterStatsMaxIterations"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14mKvCacheConfigE", "tensorrt_llm::executor::ExecutorConfig::mKvCacheConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig26mLogitsPostProcessorConfigE", "tensorrt_llm::executor::ExecutorConfig::mLogitsPostProcessorConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mMaxBatchSizeE", "tensorrt_llm::executor::ExecutorConfig::mMaxBatchSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mMaxBeamWidthE", "tensorrt_llm::executor::ExecutorConfig::mMaxBeamWidth"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mMaxNumTokensE", "tensorrt_llm::executor::ExecutorConfig::mMaxNumTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig13mMaxQueueSizeE", "tensorrt_llm::executor::ExecutorConfig::mMaxQueueSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23mMaxSeqIdleMicrosecondsE", "tensorrt_llm::executor::ExecutorConfig::mMaxSeqIdleMicroseconds"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18mNormalizeLogProbsE", "tensorrt_llm::executor::ExecutorConfig::mNormalizeLogProbs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15mParallelConfigE", "tensorrt_llm::executor::ExecutorConfig::mParallelConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig16mPeftCacheConfigE", "tensorrt_llm::executor::ExecutorConfig::mPeftCacheConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig22mPromptTableOffloadingE", "tensorrt_llm::executor::ExecutorConfig::mPromptTableOffloading"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig17mRecvPollPeriodMsE", "tensorrt_llm::executor::ExecutorConfig::mRecvPollPeriodMs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig26mRequestStatsMaxIterationsE", "tensorrt_llm::executor::ExecutorConfig::mRequestStatsMaxIterations"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig16mSchedulerConfigE", "tensorrt_llm::executor::ExecutorConfig::mSchedulerConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig26mSpeculativeDecodingConfigE", "tensorrt_llm::executor::ExecutorConfig::mSpeculativeDecodingConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig20mUseGpuDirectStorageE", "tensorrt_llm::executor::ExecutorConfig::mUseGpuDirectStorage"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setAdditionalModelOutputsERKNSt6vectorI21AdditionalModelOutputEE", "tensorrt_llm::executor::ExecutorConfig::setAdditionalModelOutputs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setAdditionalModelOutputsERKNSt6vectorI21AdditionalModelOutputEE", "tensorrt_llm::executor::ExecutorConfig::setAdditionalModelOutputs::additionalModelOutputs"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setBatchingTypeE12BatchingType", "tensorrt_llm::executor::ExecutorConfig::setBatchingType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setBatchingTypeE12BatchingType", "tensorrt_llm::executor::ExecutorConfig::setBatchingType::batchingType"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setCacheTransceiverConfigERK22CacheTransceiverConfig", "tensorrt_llm::executor::ExecutorConfig::setCacheTransceiverConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setCacheTransceiverConfigERK22CacheTransceiverConfig", "tensorrt_llm::executor::ExecutorConfig::setCacheTransceiverConfig::cacheTransceiverConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14setDebugConfigERK11DebugConfig", 
"tensorrt_llm::executor::ExecutorConfig::setDebugConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig14setDebugConfigERK11DebugConfig", "tensorrt_llm::executor::ExecutorConfig::setDebugConfig::debugConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig17setDecodingConfigERK14DecodingConfig", "tensorrt_llm::executor::ExecutorConfig::setDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig17setDecodingConfigERK14DecodingConfig", "tensorrt_llm::executor::ExecutorConfig::setDecodingConfig::decodingConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23setEnableChunkedContextEb", "tensorrt_llm::executor::ExecutorConfig::setEnableChunkedContext"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23setEnableChunkedContextEb", "tensorrt_llm::executor::ExecutorConfig::setEnableChunkedContext::enableChunkedContext"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig19setEnableTrtOverlapEb", "tensorrt_llm::executor::ExecutorConfig::setEnableTrtOverlap"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig19setEnableTrtOverlapEb", "tensorrt_llm::executor::ExecutorConfig::setEnableTrtOverlap::enableTrtOverlap"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig32setExtendedRuntimePerfKnobConfigERK29ExtendedRuntimePerfKnobConfig", "tensorrt_llm::executor::ExecutorConfig::setExtendedRuntimePerfKnobConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig32setExtendedRuntimePerfKnobConfigERK29ExtendedRuntimePerfKnobConfig", "tensorrt_llm::executor::ExecutorConfig::setExtendedRuntimePerfKnobConfig::extendedRuntimePerfKnobConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setGatherGenerationLogitsEb", "tensorrt_llm::executor::ExecutorConfig::setGatherGenerationLogits"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setGatherGenerationLogitsEb", "tensorrt_llm::executor::ExecutorConfig::setGatherGenerationLogits::gatherGenerationLogits"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig20setGpuWeightsPercentERKf", "tensorrt_llm::executor::ExecutorConfig::setGpuWeightsPercent"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig20setGpuWeightsPercentERKf", "tensorrt_llm::executor::ExecutorConfig::setGpuWeightsPercent::gpuWeightsPercent"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23setGuidedDecodingConfigERK20GuidedDecodingConfig", "tensorrt_llm::executor::ExecutorConfig::setGuidedDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig23setGuidedDecodingConfigERK20GuidedDecodingConfig", "tensorrt_llm::executor::ExecutorConfig::setGuidedDecodingConfig::guidedDecodingConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setIterStatsMaxIterationsE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setIterStatsMaxIterations"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setIterStatsMaxIterationsE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setIterStatsMaxIterations::iterStatsMaxIterations"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig16setKvCacheConfigERK13KvCacheConfig", "tensorrt_llm::executor::ExecutorConfig::setKvCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig16setKvCacheConfigERK13KvCacheConfig", "tensorrt_llm::executor::ExecutorConfig::setKvCacheConfig::kvCacheConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig28setLogitsPostProcessorConfigERK25LogitsPostProcessorConfig", "tensorrt_llm::executor::ExecutorConfig::setLogitsPostProcessorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig28setLogitsPostProcessorConfigERK25LogitsPostProcessorConfig", "tensorrt_llm::executor::ExecutorConfig::setLogitsPostProcessorConfig::logitsPostProcessorConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxBatchSizeE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setMaxBatchSize"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxBatchSizeE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setMaxBatchSize::maxBatchSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxBeamWidthE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setMaxBeamWidth"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxBeamWidthE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setMaxBeamWidth::maxBeamWidth"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxNumTokensE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setMaxNumTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxNumTokensE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setMaxNumTokens::maxNumTokens"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxQueueSizeERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::ExecutorConfig::setMaxQueueSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig15setMaxQueueSizeERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::ExecutorConfig::setMaxQueueSize::maxQueueSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setMaxSeqIdleMicrosecondsE8uint64_t", "tensorrt_llm::executor::ExecutorConfig::setMaxSeqIdleMicroseconds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig25setMaxSeqIdleMicrosecondsE8uint64_t", "tensorrt_llm::executor::ExecutorConfig::setMaxSeqIdleMicroseconds::maxSeqIdleMicroseconds"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig20setNormalizeLogProbsEb", "tensorrt_llm::executor::ExecutorConfig::setNormalizeLogProbs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig20setNormalizeLogProbsEb", "tensorrt_llm::executor::ExecutorConfig::setNormalizeLogProbs::normalizeLogProbs"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig17setParallelConfigERK14ParallelConfig", "tensorrt_llm::executor::ExecutorConfig::setParallelConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig17setParallelConfigERK14ParallelConfig", "tensorrt_llm::executor::ExecutorConfig::setParallelConfig::parallelConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18setPeftCacheConfigERK15PeftCacheConfig", "tensorrt_llm::executor::ExecutorConfig::setPeftCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18setPeftCacheConfigERK15PeftCacheConfig", "tensorrt_llm::executor::ExecutorConfig::setPeftCacheConfig::peftCacheConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig24setPromptTableOffloadingEb", "tensorrt_llm::executor::ExecutorConfig::setPromptTableOffloading"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig24setPromptTableOffloadingEb", "tensorrt_llm::executor::ExecutorConfig::setPromptTableOffloading::promptTableOffloading"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig19setRecvPollPeriodMsERK10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setRecvPollPeriodMs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig19setRecvPollPeriodMsERK10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setRecvPollPeriodMs::recvPollPeriodMs"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig28setRequestStatsMaxIterationsE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setRequestStatsMaxIterations"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig28setRequestStatsMaxIterationsE10SizeType32", "tensorrt_llm::executor::ExecutorConfig::setRequestStatsMaxIterations::requestStatsMaxIterations"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18setSchedulerConfigERK15SchedulerConfig", "tensorrt_llm::executor::ExecutorConfig::setSchedulerConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig18setSchedulerConfigERK15SchedulerConfig", "tensorrt_llm::executor::ExecutorConfig::setSchedulerConfig::schedulerConfig"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor14ExecutorConfig16setSpecDecConfigERK25SpeculativeDecodingConfig", "tensorrt_llm::executor::ExecutorConfig::setSpecDecConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig16setSpecDecConfigERK25SpeculativeDecodingConfig", "tensorrt_llm::executor::ExecutorConfig::setSpecDecConfig::specDecConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig22setUseGpuDirectStorageERKb", "tensorrt_llm::executor::ExecutorConfig::setUseGpuDirectStorage"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ExecutorConfig22setUseGpuDirectStorageERKb", "tensorrt_llm::executor::ExecutorConfig::setUseGpuDirectStorage::useGpuDirectStorage"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfigE", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig29ExtendedRuntimePerfKnobConfigEbbb10SizeType32", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::ExtendedRuntimePerfKnobConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig29ExtendedRuntimePerfKnobConfigEbbb10SizeType32", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::ExtendedRuntimePerfKnobConfig::cudaGraphCacheSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig29ExtendedRuntimePerfKnobConfigEbbb10SizeType32", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::ExtendedRuntimePerfKnobConfig::cudaGraphMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig29ExtendedRuntimePerfKnobConfigEbbb10SizeType32", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::ExtendedRuntimePerfKnobConfig::enableContextFMHAFP32Acc"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig29ExtendedRuntimePerfKnobConfigEbbb10SizeType32", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::ExtendedRuntimePerfKnobConfig::multiBlockMode"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig21getCudaGraphCacheSizeEv", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::getCudaGraphCacheSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig16getCudaGraphModeEv", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::getCudaGraphMode"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig27getEnableContextFMHAFP32AccEv", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::getEnableContextFMHAFP32Acc"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig17getMultiBlockModeEv", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::getMultiBlockMode"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig19mCudaGraphCacheSizeE", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::mCudaGraphCacheSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig14mCudaGraphModeE", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::mCudaGraphMode"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig25mEnableContextFMHAFP32AccE", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::mEnableContextFMHAFP32Acc"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig15mMultiBlockModeE", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::mMultiBlockMode"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfigeqERK29ExtendedRuntimePerfKnobConfig", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfigeqERK29ExtendedRuntimePerfKnobConfig", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::operator==::other"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig21setCudaGraphCacheSizeE10SizeType32", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setCudaGraphCacheSize"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig21setCudaGraphCacheSizeE10SizeType32", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setCudaGraphCacheSize::cacheSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig16setCudaGraphModeEb", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setCudaGraphMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig16setCudaGraphModeEb", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setCudaGraphMode::cudaGraphMode"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig27setEnableContextFMHAFP32AccEb", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setEnableContextFMHAFP32Acc"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig27setEnableContextFMHAFP32AccEb", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setEnableContextFMHAFP32Acc::enableContextFMHAFP32Acc"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig17setMultiBlockModeEb", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setMultiBlockMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor29ExtendedRuntimePerfKnobConfig17setMultiBlockModeEb", "tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setMultiBlockMode::multiBlockMode"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfigE", "tensorrt_llm::executor::ExternalDraftTokensConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig25ExternalDraftTokensConfigE9VecTokensNSt8optionalI6TensorEERKNSt8optionalI9FloatTypeEERKNSt8optionalIbEE", "tensorrt_llm::executor::ExternalDraftTokensConfig::ExternalDraftTokensConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig25ExternalDraftTokensConfigE9VecTokensNSt8optionalI6TensorEERKNSt8optionalI9FloatTypeEERKNSt8optionalIbEE", "tensorrt_llm::executor::ExternalDraftTokensConfig::ExternalDraftTokensConfig::acceptanceThreshold"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig25ExternalDraftTokensConfigE9VecTokensNSt8optionalI6TensorEERKNSt8optionalI9FloatTypeEERKNSt8optionalIbEE", "tensorrt_llm::executor::ExternalDraftTokensConfig::ExternalDraftTokensConfig::fastLogits"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig25ExternalDraftTokensConfigE9VecTokensNSt8optionalI6TensorEERKNSt8optionalI9FloatTypeEERKNSt8optionalIbEE", "tensorrt_llm::executor::ExternalDraftTokensConfig::ExternalDraftTokensConfig::logits"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig25ExternalDraftTokensConfigE9VecTokensNSt8optionalI6TensorEERKNSt8optionalI9FloatTypeEERKNSt8optionalIbEE", "tensorrt_llm::executor::ExternalDraftTokensConfig::ExternalDraftTokensConfig::tokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor25ExternalDraftTokensConfig22getAcceptanceThresholdEv", "tensorrt_llm::executor::ExternalDraftTokensConfig::getAcceptanceThreshold"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor25ExternalDraftTokensConfig13getFastLogitsEv", "tensorrt_llm::executor::ExternalDraftTokensConfig::getFastLogits"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor25ExternalDraftTokensConfig9getLogitsEv", "tensorrt_llm::executor::ExternalDraftTokensConfig::getLogits"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor25ExternalDraftTokensConfig9getTokensEv", "tensorrt_llm::executor::ExternalDraftTokensConfig::getTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig20mAcceptanceThresholdE", "tensorrt_llm::executor::ExternalDraftTokensConfig::mAcceptanceThreshold"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig11mFastLogitsE", "tensorrt_llm::executor::ExternalDraftTokensConfig::mFastLogits"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig7mLogitsE", "tensorrt_llm::executor::ExternalDraftTokensConfig::mLogits"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor25ExternalDraftTokensConfig7mTokensE", 
"tensorrt_llm::executor::ExternalDraftTokensConfig::mTokens"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor12FinishReasonE", "tensorrt_llm::executor::FinishReason"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12FinishReason10kCANCELLEDE", "tensorrt_llm::executor::FinishReason::kCANCELLED"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12FinishReason7kEND_IDE", "tensorrt_llm::executor::FinishReason::kEND_ID"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12FinishReason7kLENGTHE", "tensorrt_llm::executor::FinishReason::kLENGTH"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12FinishReason13kNOT_FINISHEDE", "tensorrt_llm::executor::FinishReason::kNOT_FINISHED"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12FinishReason11kSTOP_WORDSE", "tensorrt_llm::executor::FinishReason::kSTOP_WORDS"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12FinishReason10kTIMED_OUTE", "tensorrt_llm::executor::FinishReason::kTIMED_OUT"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor9FloatTypeE", "tensorrt_llm::executor::FloatType"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfigE", "tensorrt_llm::executor::GuidedDecodingConfig"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig21GuidedDecodingBackendE", "tensorrt_llm::executor::GuidedDecodingConfig::GuidedDecodingBackend"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig21GuidedDecodingBackend9kXGRAMMARE", "tensorrt_llm::executor::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig20GuidedDecodingConfigE21GuidedDecodingBackendNSt8optionalINSt6vectorINSt6stringEEEEENSt8optionalINSt6stringEEENSt8optionalINSt6vectorI11TokenIdTypeEEEE", "tensorrt_llm::executor::GuidedDecodingConfig::GuidedDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig20GuidedDecodingConfigE21GuidedDecodingBackendNSt8optionalINSt6vectorINSt6stringEEEEENSt8optionalINSt6stringEEENSt8optionalINSt6vectorI11TokenIdTypeEEEE", 
"tensorrt_llm::executor::GuidedDecodingConfig::GuidedDecodingConfig::backend"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig20GuidedDecodingConfigE21GuidedDecodingBackendNSt8optionalINSt6vectorINSt6stringEEEEENSt8optionalINSt6stringEEENSt8optionalINSt6vectorI11TokenIdTypeEEEE", "tensorrt_llm::executor::GuidedDecodingConfig::GuidedDecodingConfig::encodedVocab"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig20GuidedDecodingConfigE21GuidedDecodingBackendNSt8optionalINSt6vectorINSt6stringEEEEENSt8optionalINSt6stringEEENSt8optionalINSt6vectorI11TokenIdTypeEEEE", "tensorrt_llm::executor::GuidedDecodingConfig::GuidedDecodingConfig::stopTokenIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig20GuidedDecodingConfigE21GuidedDecodingBackendNSt8optionalINSt6vectorINSt6stringEEEEENSt8optionalINSt6stringEEENSt8optionalINSt6vectorI11TokenIdTypeEEEE", "tensorrt_llm::executor::GuidedDecodingConfig::GuidedDecodingConfig::tokenizerStr"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig10getBackendEv", "tensorrt_llm::executor::GuidedDecodingConfig::getBackend"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig15getEncodedVocabEv", "tensorrt_llm::executor::GuidedDecodingConfig::getEncodedVocab"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig15getStopTokenIdsEv", "tensorrt_llm::executor::GuidedDecodingConfig::getStopTokenIds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig15getTokenizerStrEv", "tensorrt_llm::executor::GuidedDecodingConfig::getTokenizerStr"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig8mBackendE", "tensorrt_llm::executor::GuidedDecodingConfig::mBackend"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig13mEncodedVocabE", "tensorrt_llm::executor::GuidedDecodingConfig::mEncodedVocab"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig13mStopTokenIdsE", 
"tensorrt_llm::executor::GuidedDecodingConfig::mStopTokenIds"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig13mTokenizerStrE", "tensorrt_llm::executor::GuidedDecodingConfig::mTokenizerStr"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfigeqERK20GuidedDecodingConfig", "tensorrt_llm::executor::GuidedDecodingConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfigeqERK20GuidedDecodingConfig", "tensorrt_llm::executor::GuidedDecodingConfig::operator==::other"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig10setBackendERK21GuidedDecodingBackend", "tensorrt_llm::executor::GuidedDecodingConfig::setBackend"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig10setBackendERK21GuidedDecodingBackend", "tensorrt_llm::executor::GuidedDecodingConfig::setBackend::backend"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig15setEncodedVocabERKNSt6vectorINSt6stringEEE", "tensorrt_llm::executor::GuidedDecodingConfig::setEncodedVocab"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig15setEncodedVocabERKNSt6vectorINSt6stringEEE", "tensorrt_llm::executor::GuidedDecodingConfig::setEncodedVocab::encodedVocab"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig15setStopTokenIdsERKNSt6vectorI11TokenIdTypeEE", "tensorrt_llm::executor::GuidedDecodingConfig::setStopTokenIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig15setStopTokenIdsERKNSt6vectorI11TokenIdTypeEE", "tensorrt_llm::executor::GuidedDecodingConfig::setStopTokenIds::stopTokenIds"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig15setTokenizerStrERKNSt6stringE", "tensorrt_llm::executor::GuidedDecodingConfig::setTokenizerStr"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingConfig15setTokenizerStrERKNSt6stringE", "tensorrt_llm::executor::GuidedDecodingConfig::setTokenizerStr::tokenizerStr"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor20GuidedDecodingConfig8validateEv", "tensorrt_llm::executor::GuidedDecodingConfig::validate"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParamsE", "tensorrt_llm::executor::GuidedDecodingParams"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideTypeE", "tensorrt_llm::executor::GuidedDecodingParams::GuideType"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType13kEBNF_GRAMMARE", "tensorrt_llm::executor::GuidedDecodingParams::GuideType::kEBNF_GRAMMAR"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType5kJSONE", "tensorrt_llm::executor::GuidedDecodingParams::GuideType::kJSON"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType12kJSON_SCHEMAE", "tensorrt_llm::executor::GuidedDecodingParams::GuideType::kJSON_SCHEMA"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType6kREGEXE", "tensorrt_llm::executor::GuidedDecodingParams::GuideType::kREGEX"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams9GuideType15kSTRUCTURAL_TAGE", "tensorrt_llm::executor::GuidedDecodingParams::GuideType::kSTRUCTURAL_TAG"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams20GuidedDecodingParamsE9GuideTypeNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::GuidedDecodingParams::GuidedDecodingParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams20GuidedDecodingParamsE9GuideTypeNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::GuidedDecodingParams::GuidedDecodingParams::guide"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams20GuidedDecodingParamsE9GuideTypeNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::GuidedDecodingParams::GuidedDecodingParams::guideType"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingParams8getGuideEv", "tensorrt_llm::executor::GuidedDecodingParams::getGuide"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor20GuidedDecodingParams12getGuideTypeEv", "tensorrt_llm::executor::GuidedDecodingParams::getGuideType"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams6mGuideE", "tensorrt_llm::executor::GuidedDecodingParams::mGuide"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor20GuidedDecodingParams10mGuideTypeE", "tensorrt_llm::executor::GuidedDecodingParams::mGuideType"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingParamseqERK20GuidedDecodingParams", "tensorrt_llm::executor::GuidedDecodingParams::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor20GuidedDecodingParamseqERK20GuidedDecodingParams", "tensorrt_llm::executor::GuidedDecodingParams::operator==::other"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor6IdTypeE", "tensorrt_llm::executor::IdType"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStatsE", "tensorrt_llm::executor::InflightBatchingStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats26avgNumDecodedTokensPerIterE", "tensorrt_llm::executor::InflightBatchingStats::avgNumDecodedTokensPerIter"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats12microBatchIdE", "tensorrt_llm::executor::InflightBatchingStats::microBatchId"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats18numContextRequestsE", "tensorrt_llm::executor::InflightBatchingStats::numContextRequests"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats12numCtxTokensE", "tensorrt_llm::executor::InflightBatchingStats::numCtxTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats14numGenRequestsE", "tensorrt_llm::executor::InflightBatchingStats::numGenRequests"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats17numPausedRequestsE", "tensorrt_llm::executor::InflightBatchingStats::numPausedRequests"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor21InflightBatchingStats20numScheduledRequestsE", 
"tensorrt_llm::executor::InflightBatchingStats::numScheduledRequests"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor14IterationStatsE", "tensorrt_llm::executor::IterationStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats11cpuMemUsageE", "tensorrt_llm::executor::IterationStats::cpuMemUsage"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats17crossKvCacheStatsE", "tensorrt_llm::executor::IterationStats::crossKvCacheStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats11gpuMemUsageE", "tensorrt_llm::executor::IterationStats::gpuMemUsage"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats21inflightBatchingStatsE", "tensorrt_llm::executor::IterationStats::inflightBatchingStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats4iterE", "tensorrt_llm::executor::IterationStats::iter"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats13iterLatencyMSE", "tensorrt_llm::executor::IterationStats::iterLatencyMS"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats12kvCacheStatsE", "tensorrt_llm::executor::IterationStats::kvCacheStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats19maxBatchSizeRuntimeE", "tensorrt_llm::executor::IterationStats::maxBatchSizeRuntime"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats18maxBatchSizeStaticE", "tensorrt_llm::executor::IterationStats::maxBatchSizeStatic"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats28maxBatchSizeTunerRecommendedE", "tensorrt_llm::executor::IterationStats::maxBatchSizeTunerRecommended"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats20maxNumActiveRequestsE", "tensorrt_llm::executor::IterationStats::maxNumActiveRequests"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats19maxNumTokensRuntimeE", "tensorrt_llm::executor::IterationStats::maxNumTokensRuntime"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats18maxNumTokensStaticE", 
"tensorrt_llm::executor::IterationStats::maxNumTokensStatic"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats28maxNumTokensTunerRecommendedE", "tensorrt_llm::executor::IterationStats::maxNumTokensTunerRecommended"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats31newActiveRequestsQueueLatencyMSE", "tensorrt_llm::executor::IterationStats::newActiveRequestsQueueLatencyMS"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats17numActiveRequestsE", "tensorrt_llm::executor::IterationStats::numActiveRequests"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats20numCompletedRequestsE", "tensorrt_llm::executor::IterationStats::numCompletedRequests"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats20numNewActiveRequestsE", "tensorrt_llm::executor::IterationStats::numNewActiveRequests"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats17numQueuedRequestsE", "tensorrt_llm::executor::IterationStats::numQueuedRequests"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats14pinnedMemUsageE", "tensorrt_llm::executor::IterationStats::pinnedMemUsage"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats12specDecStatsE", "tensorrt_llm::executor::IterationStats::specDecStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats19staticBatchingStatsE", "tensorrt_llm::executor::IterationStats::staticBatchingStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14IterationStats9timestampE", "tensorrt_llm::executor::IterationStats::timestamp"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor13IterationTypeE", "tensorrt_llm::executor::IterationType"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor17JsonSerializationE", "tensorrt_llm::executor::JsonSerialization"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK12RequestStats", "tensorrt_llm::executor::JsonSerialization::toJsonStr"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK14IterationStats", 
"tensorrt_llm::executor::JsonSerialization::toJsonStr"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK24RequestStatsPerIteration", "tensorrt_llm::executor::JsonSerialization::toJsonStr"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK14IterationStats", "tensorrt_llm::executor::JsonSerialization::toJsonStr::iterationStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK12RequestStats", "tensorrt_llm::executor::JsonSerialization::toJsonStr::requestStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor17JsonSerialization9toJsonStrERK24RequestStatsPerIteration", "tensorrt_llm::executor::JsonSerialization::toJsonStr::requestStatsPerIter"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheCreatedDataE", "tensorrt_llm::executor::KVCacheCreatedData"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheCreatedData22numBlocksPerCacheLevelE", "tensorrt_llm::executor::KVCacheCreatedData::numBlocksPerCacheLevel"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor12KVCacheEventE", "tensorrt_llm::executor::KVCacheEvent"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12KVCacheEvent12KVCacheEventE6IdType16KVCacheEventData", "tensorrt_llm::executor::KVCacheEvent::KVCacheEvent"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12KVCacheEvent12KVCacheEventE6IdType16KVCacheEventData", "tensorrt_llm::executor::KVCacheEvent::KVCacheEvent::data"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12KVCacheEvent12KVCacheEventE6IdType16KVCacheEventData", "tensorrt_llm::executor::KVCacheEvent::KVCacheEvent::eventId"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KVCacheEvent4dataE", "tensorrt_llm::executor::KVCacheEvent::data"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KVCacheEvent7eventIdE", "tensorrt_llm::executor::KVCacheEvent::eventId"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor16KVCacheEventDataE", "tensorrt_llm::executor::KVCacheEventData"], [0, 2, 1, "_CPPv4I0EN12tensorrt_llm8executor16KVCacheEventDiffE", 
"tensorrt_llm::executor::KVCacheEventDiff"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor16KVCacheEventDiffE", "tensorrt_llm::executor::KVCacheEventDiff::T"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor16KVCacheEventDiff8newValueE", "tensorrt_llm::executor::KVCacheEventDiff::newValue"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor16KVCacheEventDiff8oldValueE", "tensorrt_llm::executor::KVCacheEventDiff::oldValue"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManagerE", "tensorrt_llm::executor::KVCacheEventManager"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManager19KVCacheEventManagerENSt10shared_ptrIN12tensorrt_llm13batch_manager16kv_cache_manager18BaseKVCacheManagerEEE", "tensorrt_llm::executor::KVCacheEventManager::KVCacheEventManager"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManager19KVCacheEventManagerENSt10shared_ptrIN12tensorrt_llm13batch_manager16kv_cache_manager18BaseKVCacheManagerEEE", "tensorrt_llm::executor::KVCacheEventManager::KVCacheEventManager::kvCacheManager"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManager15getLatestEventsENSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::KVCacheEventManager::getLatestEvents"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManager15getLatestEventsENSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::KVCacheEventManager::getLatestEvents::timeout"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor19KVCacheEventManager14kvCacheManagerE", "tensorrt_llm::executor::KVCacheEventManager::kvCacheManager"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheRemovedDataE", "tensorrt_llm::executor::KVCacheRemovedData"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheRemovedData11blockHashesE", "tensorrt_llm::executor::KVCacheRemovedData::blockHashes"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockDataE", "tensorrt_llm::executor::KVCacheStoredBlockData"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData22KVCacheStoredBlockDataE6IdTypeN12tensorrt_llm7runtime15VecUniqueTokensENSt8optionalIN12tensorrt_llm7runtime14LoraTaskIdTypeEEE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheStoredBlockData::KVCacheStoredBlockData"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData22KVCacheStoredBlockDataE6IdTypeN12tensorrt_llm7runtime15VecUniqueTokensENSt8optionalIN12tensorrt_llm7runtime14LoraTaskIdTypeEEE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheStoredBlockData::KVCacheStoredBlockData::blockHash"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData22KVCacheStoredBlockDataE6IdTypeN12tensorrt_llm7runtime15VecUniqueTokensENSt8optionalIN12tensorrt_llm7runtime14LoraTaskIdTypeEEE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheStoredBlockData::KVCacheStoredBlockData::cacheLevel"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData22KVCacheStoredBlockDataE6IdTypeN12tensorrt_llm7runtime15VecUniqueTokensENSt8optionalIN12tensorrt_llm7runtime14LoraTaskIdTypeEEE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheStoredBlockData::KVCacheStoredBlockData::loraId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData22KVCacheStoredBlockDataE6IdTypeN12tensorrt_llm7runtime15VecUniqueTokensENSt8optionalIN12tensorrt_llm7runtime14LoraTaskIdTypeEEE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheStoredBlockData::KVCacheStoredBlockData::priority"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData22KVCacheStoredBlockDataE6IdTypeN12tensorrt_llm7runtime15VecUniqueTokensENSt8optionalIN12tensorrt_llm7runtime14LoraTaskIdTypeEEE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheStoredBlockData::KVCacheStoredBlockData::tokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData9blockHashE", "tensorrt_llm::executor::KVCacheStoredBlockData::blockHash"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData10cacheLevelE", "tensorrt_llm::executor::KVCacheStoredBlockData::cacheLevel"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData6loraIdE", "tensorrt_llm::executor::KVCacheStoredBlockData::loraId"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData8priorityE", "tensorrt_llm::executor::KVCacheStoredBlockData::priority"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KVCacheStoredBlockData6tokensE", "tensorrt_llm::executor::KVCacheStoredBlockData::tokens"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor17KVCacheStoredDataE", "tensorrt_llm::executor::KVCacheStoredData"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor17KVCacheStoredData6blocksE", "tensorrt_llm::executor::KVCacheStoredData::blocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor17KVCacheStoredData10parentHashE", "tensorrt_llm::executor::KVCacheStoredData::parentHash"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedDataE", "tensorrt_llm::executor::KVCacheUpdatedData"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData18KVCacheUpdatedDataE6IdType", "tensorrt_llm::executor::KVCacheUpdatedData::KVCacheUpdatedData"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData18KVCacheUpdatedDataE6IdType", "tensorrt_llm::executor::KVCacheUpdatedData::KVCacheUpdatedData::blockHash"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData9blockHashE", "tensorrt_llm::executor::KVCacheUpdatedData::blockHash"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData10cacheLevelE", "tensorrt_llm::executor::KVCacheUpdatedData::cacheLevel"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData17cacheLevelUpdatedE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheUpdatedData::cacheLevelUpdated"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData17cacheLevelUpdatedE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheUpdatedData::cacheLevelUpdated::newValue"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData17cacheLevelUpdatedE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheUpdatedData::cacheLevelUpdated::oldValue"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData8priorityE", "tensorrt_llm::executor::KVCacheUpdatedData::priority"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData15priorityUpdatedE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheUpdatedData::priorityUpdated"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData15priorityUpdatedE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheUpdatedData::priorityUpdated::newValue"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18KVCacheUpdatedData15priorityUpdatedE10SizeType3210SizeType32", "tensorrt_llm::executor::KVCacheUpdatedData::priorityUpdated::oldValue"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfigE", "tensorrt_llm::executor::KvCacheConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::copyOnPartialReuse"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::crossKvCacheFraction"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::enableBlockReuse"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::enablePartialReuse"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::eventBufferMaxSize"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::freeGpuMemoryFraction"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::hostCacheSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::maxAttentionWindowVec"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::maxTokens"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::onboardBlocks"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::runtimeDefaults"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::secondaryOffloadMinPriority"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig13KvCacheConfigEbRKNSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI6size_tEEbRKNSt8optionalI9FloatTypeEENSt8optionalI17RetentionPriorityEE6size_tRKNSt8optionalIN12tensorrt_llm7runtime15RuntimeDefaultsEEEbb", "tensorrt_llm::executor::KvCacheConfig::KvCacheConfig::sinkTokenLength"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig34fillEmptyFieldsFromRuntimeDefaultsEN12tensorrt_llm7runtime15RuntimeDefaultsE", "tensorrt_llm::executor::KvCacheConfig::fillEmptyFieldsFromRuntimeDefaults"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13KvCacheConfig34fillEmptyFieldsFromRuntimeDefaultsEN12tensorrt_llm7runtime15RuntimeDefaultsE", "tensorrt_llm::executor::KvCacheConfig::fillEmptyFieldsFromRuntimeDefaults::runtimeDefaults"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig21getCopyOnPartialReuseEv", "tensorrt_llm::executor::KvCacheConfig::getCopyOnPartialReuse"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig23getCrossKvCacheFractionEv", "tensorrt_llm::executor::KvCacheConfig::getCrossKvCacheFraction"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig19getEnableBlockReuseEv", "tensorrt_llm::executor::KvCacheConfig::getEnableBlockReuse"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig21getEnablePartialReuseEv", "tensorrt_llm::executor::KvCacheConfig::getEnablePartialReuse"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig21getEventBufferMaxSizeEv", "tensorrt_llm::executor::KvCacheConfig::getEventBufferMaxSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig24getFreeGpuMemoryFractionEv", "tensorrt_llm::executor::KvCacheConfig::getFreeGpuMemoryFraction"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig16getHostCacheSizeEv", "tensorrt_llm::executor::KvCacheConfig::getHostCacheSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig24getMaxAttentionWindowVecEv", "tensorrt_llm::executor::KvCacheConfig::getMaxAttentionWindowVec"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig12getMaxTokensEv", "tensorrt_llm::executor::KvCacheConfig::getMaxTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig16getOnboardBlocksEv", "tensorrt_llm::executor::KvCacheConfig::getOnboardBlocks"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig30getSecondaryOffloadMinPriorityEv", "tensorrt_llm::executor::KvCacheConfig::getSecondaryOffloadMinPriority"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor13KvCacheConfig18getSinkTokenLengthEv", 
"tensorrt_llm::executor::KvCacheConfig::getSinkTokenLength"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig19mCopyOnPartialReuseE", "tensorrt_llm::executor::KvCacheConfig::mCopyOnPartialReuse"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21mCrossKvCacheFractionE", "tensorrt_llm::executor::KvCacheConfig::mCrossKvCacheFraction"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig17mEnableBlockReuseE", "tensorrt_llm::executor::KvCacheConfig::mEnableBlockReuse"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig19mEnablePartialReuseE", "tensorrt_llm::executor::KvCacheConfig::mEnablePartialReuse"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig19mEventBufferMaxSizeE", "tensorrt_llm::executor::KvCacheConfig::mEventBufferMaxSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig22mFreeGpuMemoryFractionE", "tensorrt_llm::executor::KvCacheConfig::mFreeGpuMemoryFraction"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig14mHostCacheSizeE", "tensorrt_llm::executor::KvCacheConfig::mHostCacheSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig22mMaxAttentionWindowVecE", "tensorrt_llm::executor::KvCacheConfig::mMaxAttentionWindowVec"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig10mMaxTokensE", "tensorrt_llm::executor::KvCacheConfig::mMaxTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig14mOnboardBlocksE", "tensorrt_llm::executor::KvCacheConfig::mOnboardBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig28mSecondaryOffloadMinPriorityE", "tensorrt_llm::executor::KvCacheConfig::mSecondaryOffloadMinPriority"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig16mSinkTokenLengthE", "tensorrt_llm::executor::KvCacheConfig::mSinkTokenLength"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21setCopyOnPartialReuseEb", "tensorrt_llm::executor::KvCacheConfig::setCopyOnPartialReuse"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13KvCacheConfig21setCopyOnPartialReuseEb", "tensorrt_llm::executor::KvCacheConfig::setCopyOnPartialReuse::copyOnPartialReuse"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig23setCrossKvCacheFractionE9FloatType", "tensorrt_llm::executor::KvCacheConfig::setCrossKvCacheFraction"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig23setCrossKvCacheFractionE9FloatType", "tensorrt_llm::executor::KvCacheConfig::setCrossKvCacheFraction::crossKvCacheFraction"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig19setEnableBlockReuseEb", "tensorrt_llm::executor::KvCacheConfig::setEnableBlockReuse"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig19setEnableBlockReuseEb", "tensorrt_llm::executor::KvCacheConfig::setEnableBlockReuse::enableBlockReuse"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21setEnablePartialReuseEb", "tensorrt_llm::executor::KvCacheConfig::setEnablePartialReuse"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21setEnablePartialReuseEb", "tensorrt_llm::executor::KvCacheConfig::setEnablePartialReuse::enablePartialReuse"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21setEventBufferMaxSizeE6size_t", "tensorrt_llm::executor::KvCacheConfig::setEventBufferMaxSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig21setEventBufferMaxSizeE6size_t", "tensorrt_llm::executor::KvCacheConfig::setEventBufferMaxSize::eventBufferMaxSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig24setFreeGpuMemoryFractionE9FloatType", "tensorrt_llm::executor::KvCacheConfig::setFreeGpuMemoryFraction"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig24setFreeGpuMemoryFractionE9FloatType", "tensorrt_llm::executor::KvCacheConfig::setFreeGpuMemoryFraction::freeGpuMemoryFraction"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig16setHostCacheSizeE6size_t", "tensorrt_llm::executor::KvCacheConfig::setHostCacheSize"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13KvCacheConfig16setHostCacheSizeE6size_t", "tensorrt_llm::executor::KvCacheConfig::setHostCacheSize::hostCacheSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig24setMaxAttentionWindowVecENSt6vectorI10SizeType32EE", "tensorrt_llm::executor::KvCacheConfig::setMaxAttentionWindowVec"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig24setMaxAttentionWindowVecENSt6vectorI10SizeType32EE", "tensorrt_llm::executor::KvCacheConfig::setMaxAttentionWindowVec::maxAttentionWindowVec"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig12setMaxTokensE10SizeType32", "tensorrt_llm::executor::KvCacheConfig::setMaxTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig12setMaxTokensE10SizeType32", "tensorrt_llm::executor::KvCacheConfig::setMaxTokens::maxTokens"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig16setOnboardBlocksEb", "tensorrt_llm::executor::KvCacheConfig::setOnboardBlocks"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig16setOnboardBlocksEb", "tensorrt_llm::executor::KvCacheConfig::setOnboardBlocks::onboardBlocks"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig30setSecondaryOffloadMinPriorityENSt8optionalI17RetentionPriorityEE", "tensorrt_llm::executor::KvCacheConfig::setSecondaryOffloadMinPriority"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig30setSecondaryOffloadMinPriorityENSt8optionalI17RetentionPriorityEE", "tensorrt_llm::executor::KvCacheConfig::setSecondaryOffloadMinPriority::secondaryOffloadMinPriority"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig18setSinkTokenLengthE10SizeType32", "tensorrt_llm::executor::KvCacheConfig::setSinkTokenLength"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13KvCacheConfig18setSinkTokenLengthE10SizeType32", "tensorrt_llm::executor::KvCacheConfig::setSinkTokenLength::sinkTokenLength"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfigE", "tensorrt_llm::executor::KvCacheRetentionConfig"], [0, 3, 
1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE19KvCacheTransferModeNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::KvCacheRetentionConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigEv", "tensorrt_llm::executor::KvCacheRetentionConfig::KvCacheRetentionConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE19KvCacheTransferModeNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::KvCacheRetentionConfig::decodeDurationMs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE19KvCacheTransferModeNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::KvCacheRetentionConfig::decodeRetentionPriority"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE19KvCacheTransferModeNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::KvCacheRetentionConfig::directory"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE19KvCacheTransferModeNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::KvCacheRetentionConfig::tokenRangeRetentionPriorities"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig22KvCacheRetentionConfigERKNSt6vectorI25TokenRangeRetentionConfigEE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE19KvCacheTransferModeNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::KvCacheRetentionConfig::transferMode"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfigE", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig25TokenRangeRetentionConfigE10SizeType32NSt8optionalI10SizeType32EE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::TokenRangeRetentionConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig25TokenRangeRetentionConfigE10SizeType32NSt8optionalI10SizeType32EE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::TokenRangeRetentionConfig::durationMs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig25TokenRangeRetentionConfigE10SizeType32NSt8optionalI10SizeType32EE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::TokenRangeRetentionConfig::priority"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig25TokenRangeRetentionConfigE10SizeType32NSt8optionalI10SizeType32EE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::TokenRangeRetentionConfig::tokenEnd"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig25TokenRangeRetentionConfigE10SizeType32NSt8optionalI10SizeType32EE17RetentionPriorityNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::TokenRangeRetentionConfig::tokenStart"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig10durationMsE", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::durationMs"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfigeqERK25TokenRangeRetentionConfig", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfigeqERK25TokenRangeRetentionConfig", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::operator==::other"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig8priorityE", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::priority"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig8tokenEndE", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::tokenEnd"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25TokenRangeRetentionConfig10tokenStartE", "tensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfig::tokenStart"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig19getDecodeDurationMsEv", "tensorrt_llm::executor::KvCacheRetentionConfig::getDecodeDurationMs"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig26getDecodeRetentionPriorityEv", "tensorrt_llm::executor::KvCacheRetentionConfig::getDecodeRetentionPriority"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig12getDirectoryEv", 
"tensorrt_llm::executor::KvCacheRetentionConfig::getDirectory"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig36getPerBlockRetentionPriorityDurationE10SizeType3210SizeType32", "tensorrt_llm::executor::KvCacheRetentionConfig::getPerBlockRetentionPriorityDuration"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig36getPerBlockRetentionPriorityDurationE10SizeType3210SizeType32", "tensorrt_llm::executor::KvCacheRetentionConfig::getPerBlockRetentionPriorityDuration::blockSize"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig36getPerBlockRetentionPriorityDurationE10SizeType3210SizeType32", "tensorrt_llm::executor::KvCacheRetentionConfig::getPerBlockRetentionPriorityDuration::seqLen"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig29getTokenRangeRetentionConfigsEv", "tensorrt_llm::executor::KvCacheRetentionConfig::getTokenRangeRetentionConfigs"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfig15getTransferModeEv", "tensorrt_llm::executor::KvCacheRetentionConfig::getTransferMode"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig25kDefaultRetentionPriorityE", "tensorrt_llm::executor::KvCacheRetentionConfig::kDefaultRetentionPriority"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig21kMaxRetentionPriorityE", "tensorrt_llm::executor::KvCacheRetentionConfig::kMaxRetentionPriority"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig21kMinRetentionPriorityE", "tensorrt_llm::executor::KvCacheRetentionConfig::kMinRetentionPriority"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig17mDecodeDurationMsE", "tensorrt_llm::executor::KvCacheRetentionConfig::mDecodeDurationMs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig24mDecodeRetentionPriorityE", "tensorrt_llm::executor::KvCacheRetentionConfig::mDecodeRetentionPriority"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig10mDirectoryE", "tensorrt_llm::executor::KvCacheRetentionConfig::mDirectory"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig27mTokenRangeRetentionConfigsE", "tensorrt_llm::executor::KvCacheRetentionConfig::mTokenRangeRetentionConfigs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor22KvCacheRetentionConfig13mTransferModeE", "tensorrt_llm::executor::KvCacheRetentionConfig::mTransferMode"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfigeqERK22KvCacheRetentionConfig", "tensorrt_llm::executor::KvCacheRetentionConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor22KvCacheRetentionConfigeqERK22KvCacheRetentionConfig", "tensorrt_llm::executor::KvCacheRetentionConfig::operator==::other"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStatsE", "tensorrt_llm::executor::KvCacheStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStats14allocNewBlocksE", "tensorrt_llm::executor::KvCacheStats::allocNewBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStats16allocTotalBlocksE", "tensorrt_llm::executor::KvCacheStats::allocTotalBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStats12cacheHitRateE", "tensorrt_llm::executor::KvCacheStats::cacheHitRate"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStats13freeNumBlocksE", "tensorrt_llm::executor::KvCacheStats::freeNumBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStats12maxNumBlocksE", "tensorrt_llm::executor::KvCacheStats::maxNumBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStats12missedBlocksE", "tensorrt_llm::executor::KvCacheStats::missedBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStats12reusedBlocksE", "tensorrt_llm::executor::KvCacheStats::reusedBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12KvCacheStats14tokensPerBlockE", "tensorrt_llm::executor::KvCacheStats::tokensPerBlock"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor12KvCacheStats13usedNumBlocksE", "tensorrt_llm::executor::KvCacheStats::usedNumBlocks"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor19KvCacheTransferModeE", "tensorrt_llm::executor::KvCacheTransferMode"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode4DRAME", "tensorrt_llm::executor::KvCacheTransferMode::DRAM"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode3GDSE", "tensorrt_llm::executor::KvCacheTransferMode::GDS"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor19KvCacheTransferMode20POSIX_DEBUG_FALLBACKE", "tensorrt_llm::executor::KvCacheTransferMode::POSIX_DEBUG_FALLBACK"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor19LogitsPostProcessorE", "tensorrt_llm::executor::LogitsPostProcessor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor26LogitsPostProcessorBatchedE", "tensorrt_llm::executor::LogitsPostProcessorBatched"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfigE", "tensorrt_llm::executor::LogitsPostProcessorConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig25LogitsPostProcessorConfigENSt8optionalI22LogitsPostProcessorMapEENSt8optionalI26LogitsPostProcessorBatchedEEb", "tensorrt_llm::executor::LogitsPostProcessorConfig::LogitsPostProcessorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig25LogitsPostProcessorConfigENSt8optionalI22LogitsPostProcessorMapEENSt8optionalI26LogitsPostProcessorBatchedEEb", "tensorrt_llm::executor::LogitsPostProcessorConfig::LogitsPostProcessorConfig::processorBatched"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig25LogitsPostProcessorConfigENSt8optionalI22LogitsPostProcessorMapEENSt8optionalI26LogitsPostProcessorBatchedEEb", "tensorrt_llm::executor::LogitsPostProcessorConfig::LogitsPostProcessorConfig::processorMap"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig25LogitsPostProcessorConfigENSt8optionalI22LogitsPostProcessorMapEENSt8optionalI26LogitsPostProcessorBatchedEEb", "tensorrt_llm::executor::LogitsPostProcessorConfig::LogitsPostProcessorConfig::replicate"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor25LogitsPostProcessorConfig19getProcessorBatchedEv", "tensorrt_llm::executor::LogitsPostProcessorConfig::getProcessorBatched"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor25LogitsPostProcessorConfig15getProcessorMapEv", "tensorrt_llm::executor::LogitsPostProcessorConfig::getProcessorMap"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor25LogitsPostProcessorConfig12getReplicateEv", "tensorrt_llm::executor::LogitsPostProcessorConfig::getReplicate"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig17mProcessorBatchedE", "tensorrt_llm::executor::LogitsPostProcessorConfig::mProcessorBatched"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig13mProcessorMapE", "tensorrt_llm::executor::LogitsPostProcessorConfig::mProcessorMap"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig10mReplicateE", "tensorrt_llm::executor::LogitsPostProcessorConfig::mReplicate"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig19setProcessorBatchedERK26LogitsPostProcessorBatched", "tensorrt_llm::executor::LogitsPostProcessorConfig::setProcessorBatched"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig19setProcessorBatchedERK26LogitsPostProcessorBatched", "tensorrt_llm::executor::LogitsPostProcessorConfig::setProcessorBatched::processorBatched"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig15setProcessorMapERK22LogitsPostProcessorMap", "tensorrt_llm::executor::LogitsPostProcessorConfig::setProcessorMap"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig15setProcessorMapERK22LogitsPostProcessorMap", 
"tensorrt_llm::executor::LogitsPostProcessorConfig::setProcessorMap::processorMap"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig12setReplicateEb", "tensorrt_llm::executor::LogitsPostProcessorConfig::setReplicate"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25LogitsPostProcessorConfig12setReplicateEb", "tensorrt_llm::executor::LogitsPostProcessorConfig::setReplicate::replicate"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor22LogitsPostProcessorMapE", "tensorrt_llm::executor::LogitsPostProcessorMap"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfigE", "tensorrt_llm::executor::LookaheadDecodingConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig23LookaheadDecodingConfigE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::LookaheadDecodingConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig23LookaheadDecodingConfigEv", "tensorrt_llm::executor::LookaheadDecodingConfig::LookaheadDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig23LookaheadDecodingConfigE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::LookaheadDecodingConfig::ngramSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig23LookaheadDecodingConfigE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::LookaheadDecodingConfig::verificationSetSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig23LookaheadDecodingConfigE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::LookaheadDecodingConfig::windowSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig28calculateSpeculativeResourceEv", "tensorrt_llm::executor::LookaheadDecodingConfig::calculateSpeculativeResource"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig33calculateSpeculativeResourceTupleE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::calculateSpeculativeResourceTuple"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig33calculateSpeculativeResourceTupleE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::calculateSpeculativeResourceTuple::ngramSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig33calculateSpeculativeResourceTupleE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::calculateSpeculativeResourceTuple::verificationSetSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig33calculateSpeculativeResourceTupleE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::calculateSpeculativeResourceTuple::windowSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig3getEv", "tensorrt_llm::executor::LookaheadDecodingConfig::get"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig12getNgramSizeEv", "tensorrt_llm::executor::LookaheadDecodingConfig::getNgramSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig22getVerificationSetSizeEv", "tensorrt_llm::executor::LookaheadDecodingConfig::getVerificationSetSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig13getWindowSizeEv", "tensorrt_llm::executor::LookaheadDecodingConfig::getWindowSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig4isLEERK23LookaheadDecodingConfig", "tensorrt_llm::executor::LookaheadDecodingConfig::isLE"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfig4isLEERK23LookaheadDecodingConfig", "tensorrt_llm::executor::LookaheadDecodingConfig::isLE::that"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig7isLegalE10SizeType3210SizeType3210SizeType32", 
"tensorrt_llm::executor::LookaheadDecodingConfig::isLegal"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig7isLegalE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::isLegal::ngramSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig7isLegalE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::isLegal::verificationSetSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig7isLegalE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::executor::LookaheadDecodingConfig::isLegal::windowSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig30kDefaultLookaheadDecodingNgramE", "tensorrt_llm::executor::LookaheadDecodingConfig::kDefaultLookaheadDecodingNgram"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig40kDefaultLookaheadDecodingVerificationSetE", "tensorrt_llm::executor::LookaheadDecodingConfig::kDefaultLookaheadDecodingVerificationSet"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig31kDefaultLookaheadDecodingWindowE", "tensorrt_llm::executor::LookaheadDecodingConfig::kDefaultLookaheadDecodingWindow"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig10mNgramSizeE", "tensorrt_llm::executor::LookaheadDecodingConfig::mNgramSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig20mVerificationSetSizeE", "tensorrt_llm::executor::LookaheadDecodingConfig::mVerificationSetSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor23LookaheadDecodingConfig11mWindowSizeE", "tensorrt_llm::executor::LookaheadDecodingConfig::mWindowSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfigeqERK23LookaheadDecodingConfig", "tensorrt_llm::executor::LookaheadDecodingConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor23LookaheadDecodingConfigeqERK23LookaheadDecodingConfig", 
"tensorrt_llm::executor::LookaheadDecodingConfig::operator==::other"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor10LoraConfigE", "tensorrt_llm::executor::LoraConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor10LoraConfig10LoraConfigE6IdTypeNSt8optionalI6TensorEENSt8optionalI6TensorEE", "tensorrt_llm::executor::LoraConfig::LoraConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor10LoraConfig10LoraConfigE6IdTypeNSt8optionalI6TensorEENSt8optionalI6TensorEE", "tensorrt_llm::executor::LoraConfig::LoraConfig::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor10LoraConfig10LoraConfigE6IdTypeNSt8optionalI6TensorEENSt8optionalI6TensorEE", "tensorrt_llm::executor::LoraConfig::LoraConfig::taskId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor10LoraConfig10LoraConfigE6IdTypeNSt8optionalI6TensorEENSt8optionalI6TensorEE", "tensorrt_llm::executor::LoraConfig::LoraConfig::weights"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor10LoraConfig9getConfigEv", "tensorrt_llm::executor::LoraConfig::getConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor10LoraConfig9getTaskIdEv", "tensorrt_llm::executor::LoraConfig::getTaskId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor10LoraConfig10getWeightsEv", "tensorrt_llm::executor::LoraConfig::getWeights"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10LoraConfig7mConfigE", "tensorrt_llm::executor::LoraConfig::mConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10LoraConfig7mTaskIdE", "tensorrt_llm::executor::LoraConfig::mTaskId"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10LoraConfig8mWeightsE", "tensorrt_llm::executor::LoraConfig::mWeights"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor13MedusaChoicesE", "tensorrt_llm::executor::MedusaChoices"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor10MemoryTypeE", "tensorrt_llm::executor::MemoryType"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor10MemoryType4kCPUE", "tensorrt_llm::executor::MemoryType::kCPU"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor10MemoryType11kCPU_PINNEDE", 
"tensorrt_llm::executor::MemoryType::kCPU_PINNED"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor10MemoryType15kCPU_PINNEDPOOLE", "tensorrt_llm::executor::MemoryType::kCPU_PINNEDPOOL"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor10MemoryType4kGPUE", "tensorrt_llm::executor::MemoryType::kGPU"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor10MemoryType8kUNKNOWNE", "tensorrt_llm::executor::MemoryType::kUNKNOWN"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor10MemoryType4kUVME", "tensorrt_llm::executor::MemoryType::kUVM"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor16MillisecondsTypeE", "tensorrt_llm::executor::MillisecondsType"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor9ModelTypeE", "tensorrt_llm::executor::ModelType"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor9ModelType13kDECODER_ONLYE", "tensorrt_llm::executor::ModelType::kDECODER_ONLY"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor9ModelType16kENCODER_DECODERE", "tensorrt_llm::executor::ModelType::kENCODER_DECODER"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor9ModelType13kENCODER_ONLYE", "tensorrt_llm::executor::ModelType::kENCODER_ONLY"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor11MropeConfigE", "tensorrt_llm::executor::MropeConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor11MropeConfig11MropeConfigE6Tensor10SizeType32", "tensorrt_llm::executor::MropeConfig::MropeConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11MropeConfig11MropeConfigE6Tensor10SizeType32", "tensorrt_llm::executor::MropeConfig::MropeConfig::mropePositionDeltas"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor11MropeConfig11MropeConfigE6Tensor10SizeType32", "tensorrt_llm::executor::MropeConfig::MropeConfig::mropeRoratySinCos"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11MropeConfig22getMRopePositionDeltasEv", "tensorrt_llm::executor::MropeConfig::getMRopePositionDeltas"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor11MropeConfig20getMRopeRotaryCosSinEv", "tensorrt_llm::executor::MropeConfig::getMRopeRotaryCosSin"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor11MropeConfig20mMRopePositionDeltasE", "tensorrt_llm::executor::MropeConfig::mMRopePositionDeltas"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor11MropeConfig18mMRopeRotaryCosSinE", "tensorrt_llm::executor::MropeConfig::mMRopeRotaryCosSin"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfigE", "tensorrt_llm::executor::OrchestratorConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig18OrchestratorConfigEbNSt6stringENSt10shared_ptrIN3mpi7MpiCommEEEb", "tensorrt_llm::executor::OrchestratorConfig::OrchestratorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig18OrchestratorConfigEbNSt6stringENSt10shared_ptrIN3mpi7MpiCommEEEb", "tensorrt_llm::executor::OrchestratorConfig::OrchestratorConfig::isOrchestrator"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig18OrchestratorConfigEbNSt6stringENSt10shared_ptrIN3mpi7MpiCommEEEb", "tensorrt_llm::executor::OrchestratorConfig::OrchestratorConfig::orchLeaderComm"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig18OrchestratorConfigEbNSt6stringENSt10shared_ptrIN3mpi7MpiCommEEEb", "tensorrt_llm::executor::OrchestratorConfig::OrchestratorConfig::spawnProcesses"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig18OrchestratorConfigEbNSt6stringENSt10shared_ptrIN3mpi7MpiCommEEEb", "tensorrt_llm::executor::OrchestratorConfig::OrchestratorConfig::workerExecutablePath"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18OrchestratorConfig17getIsOrchestratorEv", "tensorrt_llm::executor::OrchestratorConfig::getIsOrchestrator"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18OrchestratorConfig17getOrchLeaderCommEv", "tensorrt_llm::executor::OrchestratorConfig::getOrchLeaderComm"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18OrchestratorConfig17getSpawnProcessesEv", "tensorrt_llm::executor::OrchestratorConfig::getSpawnProcesses"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18OrchestratorConfig23getWorkerExecutablePathEv", 
"tensorrt_llm::executor::OrchestratorConfig::getWorkerExecutablePath"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig15mIsOrchestratorE", "tensorrt_llm::executor::OrchestratorConfig::mIsOrchestrator"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig15mOrchLeaderCommE", "tensorrt_llm::executor::OrchestratorConfig::mOrchLeaderComm"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig15mSpawnProcessesE", "tensorrt_llm::executor::OrchestratorConfig::mSpawnProcesses"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig21mWorkerExecutablePathE", "tensorrt_llm::executor::OrchestratorConfig::mWorkerExecutablePath"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig17setIsOrchestratorEb", "tensorrt_llm::executor::OrchestratorConfig::setIsOrchestrator"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig17setIsOrchestratorEb", "tensorrt_llm::executor::OrchestratorConfig::setIsOrchestrator::isOrchestrator"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig17setOrchLeaderCommERKNSt10shared_ptrIN3mpi7MpiCommEEE", "tensorrt_llm::executor::OrchestratorConfig::setOrchLeaderComm"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig17setOrchLeaderCommERKNSt10shared_ptrIN3mpi7MpiCommEEE", "tensorrt_llm::executor::OrchestratorConfig::setOrchLeaderComm::orchLeaderComm"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig17setSpawnProcessesEb", "tensorrt_llm::executor::OrchestratorConfig::setSpawnProcesses"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig17setSpawnProcessesEb", "tensorrt_llm::executor::OrchestratorConfig::setSpawnProcesses::spawnProcesses"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig23setWorkerExecutablePathERKNSt6stringE", "tensorrt_llm::executor::OrchestratorConfig::setWorkerExecutablePath"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18OrchestratorConfig23setWorkerExecutablePathERKNSt6stringE", 
"tensorrt_llm::executor::OrchestratorConfig::setWorkerExecutablePath::workerExecutablePath"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfigE", "tensorrt_llm::executor::OutputConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig12OutputConfigEbbbbbbNSt8optionalINSt6vectorI21AdditionalModelOutputEEEE", "tensorrt_llm::executor::OutputConfig::OutputConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig12OutputConfigEbbbbbbNSt8optionalINSt6vectorI21AdditionalModelOutputEEEE", "tensorrt_llm::executor::OutputConfig::OutputConfig::additionalModelOutputs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig12OutputConfigEbbbbbbNSt8optionalINSt6vectorI21AdditionalModelOutputEEEE", "tensorrt_llm::executor::OutputConfig::OutputConfig::excludeInputFromOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig12OutputConfigEbbbbbbNSt8optionalINSt6vectorI21AdditionalModelOutputEEEE", "tensorrt_llm::executor::OutputConfig::OutputConfig::returnContextLogits"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig12OutputConfigEbbbbbbNSt8optionalINSt6vectorI21AdditionalModelOutputEEEE", "tensorrt_llm::executor::OutputConfig::OutputConfig::returnEncoderOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig12OutputConfigEbbbbbbNSt8optionalINSt6vectorI21AdditionalModelOutputEEEE", "tensorrt_llm::executor::OutputConfig::OutputConfig::returnGenerationLogits"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig12OutputConfigEbbbbbbNSt8optionalINSt6vectorI21AdditionalModelOutputEEEE", "tensorrt_llm::executor::OutputConfig::OutputConfig::returnLogProbs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig12OutputConfigEbbbbbbNSt8optionalINSt6vectorI21AdditionalModelOutputEEEE", "tensorrt_llm::executor::OutputConfig::OutputConfig::returnPerfMetrics"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig22additionalModelOutputsE", "tensorrt_llm::executor::OutputConfig::additionalModelOutputs"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor12OutputConfig22excludeInputFromOutputE", "tensorrt_llm::executor::OutputConfig::excludeInputFromOutput"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig19returnContextLogitsE", "tensorrt_llm::executor::OutputConfig::returnContextLogits"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig19returnEncoderOutputE", "tensorrt_llm::executor::OutputConfig::returnEncoderOutput"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig22returnGenerationLogitsE", "tensorrt_llm::executor::OutputConfig::returnGenerationLogits"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig14returnLogProbsE", "tensorrt_llm::executor::OutputConfig::returnLogProbs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12OutputConfig17returnPerfMetricsE", "tensorrt_llm::executor::OutputConfig::returnPerfMetrics"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfigE", "tensorrt_llm::executor::ParallelConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig14ParallelConfigE17CommunicationType17CommunicationModeNSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI18OrchestratorConfigEENSt8optionalI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::ParallelConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig14ParallelConfigE17CommunicationType17CommunicationModeNSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI18OrchestratorConfigEENSt8optionalI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::ParallelConfig::commMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig14ParallelConfigE17CommunicationType17CommunicationModeNSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI18OrchestratorConfigEENSt8optionalI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::ParallelConfig::commType"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14ParallelConfig14ParallelConfigE17CommunicationType17CommunicationModeNSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI18OrchestratorConfigEENSt8optionalI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::ParallelConfig::deviceIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig14ParallelConfigE17CommunicationType17CommunicationModeNSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI18OrchestratorConfigEENSt8optionalI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::ParallelConfig::numNodes"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig14ParallelConfigE17CommunicationType17CommunicationModeNSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI18OrchestratorConfigEENSt8optionalI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::ParallelConfig::orchestratorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig14ParallelConfigE17CommunicationType17CommunicationModeNSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt6vectorI10SizeType32EEEERKNSt8optionalI18OrchestratorConfigEENSt8optionalI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::ParallelConfig::participantIds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig20getCommunicationModeEv", "tensorrt_llm::executor::ParallelConfig::getCommunicationMode"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig20getCommunicationTypeEv", "tensorrt_llm::executor::ParallelConfig::getCommunicationType"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig12getDeviceIdsEv", "tensorrt_llm::executor::ParallelConfig::getDeviceIds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig11getNumNodesEv", "tensorrt_llm::executor::ParallelConfig::getNumNodes"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig21getOrchestratorConfigEv", 
"tensorrt_llm::executor::ParallelConfig::getOrchestratorConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14ParallelConfig17getParticipantIdsEv", "tensorrt_llm::executor::ParallelConfig::getParticipantIds"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig9mCommModeE", "tensorrt_llm::executor::ParallelConfig::mCommMode"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig9mCommTypeE", "tensorrt_llm::executor::ParallelConfig::mCommType"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig10mDeviceIdsE", "tensorrt_llm::executor::ParallelConfig::mDeviceIds"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig9mNumNodesE", "tensorrt_llm::executor::ParallelConfig::mNumNodes"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig19mOrchestratorConfigE", "tensorrt_llm::executor::ParallelConfig::mOrchestratorConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig15mParticipantIdsE", "tensorrt_llm::executor::ParallelConfig::mParticipantIds"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig20setCommunicationModeE17CommunicationMode", "tensorrt_llm::executor::ParallelConfig::setCommunicationMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig20setCommunicationModeE17CommunicationMode", "tensorrt_llm::executor::ParallelConfig::setCommunicationMode::mode"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig20setCommunicationTypeE17CommunicationType", "tensorrt_llm::executor::ParallelConfig::setCommunicationType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig20setCommunicationTypeE17CommunicationType", "tensorrt_llm::executor::ParallelConfig::setCommunicationType::type"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig12setDeviceIdsERKNSt6vectorI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::setDeviceIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig12setDeviceIdsERKNSt6vectorI10SizeType32EE", 
"tensorrt_llm::executor::ParallelConfig::setDeviceIds::deviceIds"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig11setNumNodesE10SizeType32", "tensorrt_llm::executor::ParallelConfig::setNumNodes"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig11setNumNodesE10SizeType32", "tensorrt_llm::executor::ParallelConfig::setNumNodes::numNodes"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig21setOrchestratorConfigERK18OrchestratorConfig", "tensorrt_llm::executor::ParallelConfig::setOrchestratorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig21setOrchestratorConfigERK18OrchestratorConfig", "tensorrt_llm::executor::ParallelConfig::setOrchestratorConfig::orchestratorConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig17setParticipantIdsERKNSt6vectorI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::setParticipantIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14ParallelConfig17setParticipantIdsERKNSt6vectorI10SizeType32EE", "tensorrt_llm::executor::ParallelConfig::setParticipantIds::participantIds"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfigE", "tensorrt_llm::executor::PeftCacheConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::deviceCachePercent"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::hostCacheSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::loraPrefetchDir"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::maxAdapterSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::maxPagesPerBlockDevice"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::maxPagesPerBlockHost"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", 
"tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::numCopyStreams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::numDeviceModuleLayer"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::numEnsureWorkers"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::numHostModuleLayer"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::numPutWorkers"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15PeftCacheConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalIfEERKNSt8optionalI6size_tEERKNSt8optionalINSt6stringEEE", "tensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig::optimalAdapterSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig21getDeviceCachePercentEv", "tensorrt_llm::executor::PeftCacheConfig::getDeviceCachePercent"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig16getHostCacheSizeEv", 
"tensorrt_llm::executor::PeftCacheConfig::getHostCacheSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig18getLoraPrefetchDirEv", "tensorrt_llm::executor::PeftCacheConfig::getLoraPrefetchDir"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig17getMaxAdapterSizeEv", "tensorrt_llm::executor::PeftCacheConfig::getMaxAdapterSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig25getMaxPagesPerBlockDeviceEv", "tensorrt_llm::executor::PeftCacheConfig::getMaxPagesPerBlockDevice"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig23getMaxPagesPerBlockHostEv", "tensorrt_llm::executor::PeftCacheConfig::getMaxPagesPerBlockHost"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig17getNumCopyStreamsEv", "tensorrt_llm::executor::PeftCacheConfig::getNumCopyStreams"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig23getNumDeviceModuleLayerEv", "tensorrt_llm::executor::PeftCacheConfig::getNumDeviceModuleLayer"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig19getNumEnsureWorkersEv", "tensorrt_llm::executor::PeftCacheConfig::getNumEnsureWorkers"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig21getNumHostModuleLayerEv", "tensorrt_llm::executor::PeftCacheConfig::getNumHostModuleLayer"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig16getNumPutWorkersEv", "tensorrt_llm::executor::PeftCacheConfig::getNumPutWorkers"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfig21getOptimalAdapterSizeEv", "tensorrt_llm::executor::PeftCacheConfig::getOptimalAdapterSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig22kDefaultMaxAdapterSizeE", "tensorrt_llm::executor::PeftCacheConfig::kDefaultMaxAdapterSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig30kDefaultMaxPagesPerBlockDeviceE", "tensorrt_llm::executor::PeftCacheConfig::kDefaultMaxPagesPerBlockDevice"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig28kDefaultMaxPagesPerBlockHostE", 
"tensorrt_llm::executor::PeftCacheConfig::kDefaultMaxPagesPerBlockHost"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig26kDefaultOptimalAdapterSizeE", "tensorrt_llm::executor::PeftCacheConfig::kDefaultOptimalAdapterSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig19mDeviceCachePercentE", "tensorrt_llm::executor::PeftCacheConfig::mDeviceCachePercent"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig14mHostCacheSizeE", "tensorrt_llm::executor::PeftCacheConfig::mHostCacheSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig16mLoraPrefetchDirE", "tensorrt_llm::executor::PeftCacheConfig::mLoraPrefetchDir"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15mMaxAdapterSizeE", "tensorrt_llm::executor::PeftCacheConfig::mMaxAdapterSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig23mMaxPagesPerBlockDeviceE", "tensorrt_llm::executor::PeftCacheConfig::mMaxPagesPerBlockDevice"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig21mMaxPagesPerBlockHostE", "tensorrt_llm::executor::PeftCacheConfig::mMaxPagesPerBlockHost"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig15mNumCopyStreamsE", "tensorrt_llm::executor::PeftCacheConfig::mNumCopyStreams"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig21mNumDeviceModuleLayerE", "tensorrt_llm::executor::PeftCacheConfig::mNumDeviceModuleLayer"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig17mNumEnsureWorkersE", "tensorrt_llm::executor::PeftCacheConfig::mNumEnsureWorkers"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig19mNumHostModuleLayerE", "tensorrt_llm::executor::PeftCacheConfig::mNumHostModuleLayer"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig14mNumPutWorkersE", "tensorrt_llm::executor::PeftCacheConfig::mNumPutWorkers"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15PeftCacheConfig19mOptimalAdapterSizeE", "tensorrt_llm::executor::PeftCacheConfig::mOptimalAdapterSize"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor15PeftCacheConfigeqERK15PeftCacheConfig", "tensorrt_llm::executor::PeftCacheConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor15PeftCacheConfigeqERK15PeftCacheConfig", "tensorrt_llm::executor::PeftCacheConfig::operator==::other"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor12PriorityTypeE", "tensorrt_llm::executor::PriorityType"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfigE", "tensorrt_llm::executor::PromptTuningConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfig18PromptTuningConfigE6TensorNSt8optionalI16VecTokenExtraIdsEE", "tensorrt_llm::executor::PromptTuningConfig::PromptTuningConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfig18PromptTuningConfigE6TensorNSt8optionalI16VecTokenExtraIdsEE", "tensorrt_llm::executor::PromptTuningConfig::PromptTuningConfig::embeddingTable"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfig18PromptTuningConfigE6TensorNSt8optionalI16VecTokenExtraIdsEE", "tensorrt_llm::executor::PromptTuningConfig::PromptTuningConfig::inputTokenExtraIds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18PromptTuningConfig17getEmbeddingTableEv", "tensorrt_llm::executor::PromptTuningConfig::getEmbeddingTable"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor18PromptTuningConfig21getInputTokenExtraIdsEv", "tensorrt_llm::executor::PromptTuningConfig::getInputTokenExtraIds"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfig15mEmbeddingTableE", "tensorrt_llm::executor::PromptTuningConfig::mEmbeddingTable"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18PromptTuningConfig19mInputTokenExtraIdsE", "tensorrt_llm::executor::PromptTuningConfig::mInputTokenExtraIds"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor14RandomSeedTypeE", "tensorrt_llm::executor::RandomSeedType"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor7RequestE", "tensorrt_llm::executor::Request"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestERK7Request", "tensorrt_llm::executor::Request::Request"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestERR7Request", "tensorrt_llm::executor::Request::Request"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::allottedTimeMs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::badWords"], [0, 
4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::clientId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", 
"tensorrt_llm::executor::Request::Request::contextPhaseParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::crossAttentionMask"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::eagleConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::embeddingBias"], 
[0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::encoderInputFeatures"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", 
"tensorrt_llm::executor::Request::Request::encoderInputTokenIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::encoderOutputLength"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::endId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", 
"tensorrt_llm::executor::Request::Request::externalDraftTokensConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::guidedDecodingParams"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::inputTokenIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", 
"tensorrt_llm::executor::Request::Request::kvCacheRetentionConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::languageAdapterUid"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::logitsPostProcessor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", 
"tensorrt_llm::executor::Request::Request::logitsPostProcessorName"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::lookaheadConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::loraConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::mRopeConfig"], [0, 
4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::maxTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", 
"tensorrt_llm::executor::Request::Request::multimodalEmbedding"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::numReturnSequences"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestERK7Request", "tensorrt_llm::executor::Request::Request::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestERR7Request", "tensorrt_llm::executor::Request::Request::other"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::outputConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::pTuningConfig"], 
[0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::padId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::positionIds"], 
[0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::priority"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", 
"tensorrt_llm::executor::Request::Request::returnAllGeneratedTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::samplingConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::skipCrossAttnBlocks"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", 
"tensorrt_llm::executor::Request::Request::stopWords"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::streaming"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request7RequestE9VecTokens10SizeType32bRK14SamplingConfigRK12OutputConfigRKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalINSt4listI9VecTokensEEEENSt8optionalI6TensorEENSt8optionalI25ExternalDraftTokensConfigEENSt8optionalI18PromptTuningConfigEENSt8optionalI6TensorEENSt8optionalI11MropeConfigEENSt8optionalI10LoraConfigEENSt8optionalI23LookaheadDecodingConfigEENSt8optionalI22KvCacheRetentionConfigEENSt8optionalINSt6stringEEENSt8optionalI19LogitsPostProcessorEENSt8optionalI9VecTokensEENSt8optionalI6IdTypeEEb12PriorityType11RequestTypeNSt8optionalI18ContextPhaseParamsEENSt8optionalI6TensorEENSt8optionalI10SizeType32EENSt8optionalI6TensorEE10SizeType32NSt8optionalI11EagleConfigEENSt8optionalI6TensorEENSt8optionalI20GuidedDecodingParamsEENSt8optionalI10SizeType32EENSt8optionalI16MillisecondsTypeEE", "tensorrt_llm::executor::Request::Request::type"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request24getAdditionalOutputNamesEv", "tensorrt_llm::executor::Request::getAdditionalOutputNames"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request17getAllottedTimeMsEv", "tensorrt_llm::executor::Request::getAllottedTimeMs"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request11getBadWordsEv", "tensorrt_llm::executor::Request::getBadWords"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request11getClientIdEv", "tensorrt_llm::executor::Request::getClientId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request21getContextPhaseParamsEv", "tensorrt_llm::executor::Request::getContextPhaseParams"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request21getCrossAttentionMaskEv", "tensorrt_llm::executor::Request::getCrossAttentionMask"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request14getEagleConfigEv", "tensorrt_llm::executor::Request::getEagleConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request16getEmbeddingBiasEv", 
"tensorrt_llm::executor::Request::getEmbeddingBias"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request23getEncoderInputFeaturesEv", "tensorrt_llm::executor::Request::getEncoderInputFeatures"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request23getEncoderInputTokenIdsEv", "tensorrt_llm::executor::Request::getEncoderInputTokenIds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request22getEncoderOutputLengthEv", "tensorrt_llm::executor::Request::getEncoderOutputLength"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request8getEndIdEv", "tensorrt_llm::executor::Request::getEndId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request28getExternalDraftTokensConfigEv", "tensorrt_llm::executor::Request::getExternalDraftTokensConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request23getGuidedDecodingParamsEv", "tensorrt_llm::executor::Request::getGuidedDecodingParams"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request16getInputTokenIdsEv", "tensorrt_llm::executor::Request::getInputTokenIds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request25getKvCacheRetentionConfigEv", "tensorrt_llm::executor::Request::getKvCacheRetentionConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request21getLanguageAdapterUidEv", "tensorrt_llm::executor::Request::getLanguageAdapterUid"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request22getLogitsPostProcessorEv", "tensorrt_llm::executor::Request::getLogitsPostProcessor"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request26getLogitsPostProcessorNameEv", "tensorrt_llm::executor::Request::getLogitsPostProcessorName"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request18getLookaheadConfigEv", "tensorrt_llm::executor::Request::getLookaheadConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request13getLoraConfigEv", "tensorrt_llm::executor::Request::getLoraConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request12getMaxTokensEv", "tensorrt_llm::executor::Request::getMaxTokens"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor7Request14getMropeConfigEv", "tensorrt_llm::executor::Request::getMropeConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request22getMultimodalEmbeddingEv", "tensorrt_llm::executor::Request::getMultimodalEmbedding"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request15getOutputConfigEv", "tensorrt_llm::executor::Request::getOutputConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request8getPadIdEv", "tensorrt_llm::executor::Request::getPadId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request14getPositionIdsEv", "tensorrt_llm::executor::Request::getPositionIds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request11getPriorityEv", "tensorrt_llm::executor::Request::getPriority"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request21getPromptTuningConfigEv", "tensorrt_llm::executor::Request::getPromptTuningConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request14getRequestTypeEv", "tensorrt_llm::executor::Request::getRequestType"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request27getReturnAllGeneratedTokensEv", "tensorrt_llm::executor::Request::getReturnAllGeneratedTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request17getSamplingConfigEv", "tensorrt_llm::executor::Request::getSamplingConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request22getSkipCrossAttnBlocksEv", "tensorrt_llm::executor::Request::getSkipCrossAttnBlocks"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request12getStopWordsEv", "tensorrt_llm::executor::Request::getStopWords"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor7Request12getStreamingEv", "tensorrt_llm::executor::Request::getStreaming"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor7Request25kBatchedPostProcessorNameE", "tensorrt_llm::executor::Request::kBatchedPostProcessorName"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor7Request16kDefaultPriorityE", "tensorrt_llm::executor::Request::kDefaultPriority"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor7Request31kDynamicPostProcessorNamePrefixE", 
"tensorrt_llm::executor::Request::kDynamicPostProcessorNamePrefix"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor7Request5mImplE", "tensorrt_llm::executor::Request::mImpl"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7RequestaSERK7Request", "tensorrt_llm::executor::Request::operator="], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7RequestaSERR7Request", "tensorrt_llm::executor::Request::operator="], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7RequestaSERK7Request", "tensorrt_llm::executor::Request::operator=::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7RequestaSERR7Request", "tensorrt_llm::executor::Request::operator=::other"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request17setAllottedTimeMsE16MillisecondsType", "tensorrt_llm::executor::Request::setAllottedTimeMs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request17setAllottedTimeMsE16MillisecondsType", "tensorrt_llm::executor::Request::setAllottedTimeMs::allottedTimeMs"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request11setBadWordsERKNSt4listI9VecTokensEE", "tensorrt_llm::executor::Request::setBadWords"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request11setBadWordsERKNSt4listI9VecTokensEE", "tensorrt_llm::executor::Request::setBadWords::badWords"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request11setClientIdE6IdType", "tensorrt_llm::executor::Request::setClientId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request11setClientIdE6IdType", "tensorrt_llm::executor::Request::setClientId::clientId"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request21setContextPhaseParamsE18ContextPhaseParams", "tensorrt_llm::executor::Request::setContextPhaseParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request21setContextPhaseParamsE18ContextPhaseParams", "tensorrt_llm::executor::Request::setContextPhaseParams::contextPhaseParams"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request21setCrossAttentionMaskE6Tensor", "tensorrt_llm::executor::Request::setCrossAttentionMask"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request21setCrossAttentionMaskE6Tensor", "tensorrt_llm::executor::Request::setCrossAttentionMask::crossAttentionMask"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request14setEagleConfigERKNSt8optionalI11EagleConfigEE", "tensorrt_llm::executor::Request::setEagleConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request14setEagleConfigERKNSt8optionalI11EagleConfigEE", "tensorrt_llm::executor::Request::setEagleConfig::eagleConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request16setEmbeddingBiasERK6Tensor", "tensorrt_llm::executor::Request::setEmbeddingBias"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request16setEmbeddingBiasERK6Tensor", "tensorrt_llm::executor::Request::setEmbeddingBias::embeddingBias"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request23setEncoderInputFeaturesE6Tensor", "tensorrt_llm::executor::Request::setEncoderInputFeatures"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request23setEncoderInputFeaturesE6Tensor", "tensorrt_llm::executor::Request::setEncoderInputFeatures::encoderInputFeatures"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request23setEncoderInputTokenIdsERK9VecTokens", "tensorrt_llm::executor::Request::setEncoderInputTokenIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request23setEncoderInputTokenIdsERK9VecTokens", "tensorrt_llm::executor::Request::setEncoderInputTokenIds::encoderInputTokenIds"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request22setEncoderOutputLengthE10SizeType32", "tensorrt_llm::executor::Request::setEncoderOutputLength"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request22setEncoderOutputLengthE10SizeType32", "tensorrt_llm::executor::Request::setEncoderOutputLength::encoderOutputLength"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request8setEndIdE10SizeType32", "tensorrt_llm::executor::Request::setEndId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request8setEndIdE10SizeType32", "tensorrt_llm::executor::Request::setEndId::endId"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor7Request28setExternalDraftTokensConfigERK25ExternalDraftTokensConfig", "tensorrt_llm::executor::Request::setExternalDraftTokensConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request28setExternalDraftTokensConfigERK25ExternalDraftTokensConfig", "tensorrt_llm::executor::Request::setExternalDraftTokensConfig::externalDraftTokensConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request23setGuidedDecodingParamsERK20GuidedDecodingParams", "tensorrt_llm::executor::Request::setGuidedDecodingParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request23setGuidedDecodingParamsERK20GuidedDecodingParams", "tensorrt_llm::executor::Request::setGuidedDecodingParams::guidedDecodingParams"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request25setKvCacheRetentionConfigERK22KvCacheRetentionConfig", "tensorrt_llm::executor::Request::setKvCacheRetentionConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request25setKvCacheRetentionConfigERK22KvCacheRetentionConfig", "tensorrt_llm::executor::Request::setKvCacheRetentionConfig::kvCacheRetentionConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request21setLanguageAdapterUidE10SizeType32", "tensorrt_llm::executor::Request::setLanguageAdapterUid"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request21setLanguageAdapterUidE10SizeType32", "tensorrt_llm::executor::Request::setLanguageAdapterUid::languageAdapterUid"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request22setLogitsPostProcessorERKNSt8optionalI19LogitsPostProcessorEE", "tensorrt_llm::executor::Request::setLogitsPostProcessor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request22setLogitsPostProcessorERKNSt8optionalI19LogitsPostProcessorEE", "tensorrt_llm::executor::Request::setLogitsPostProcessor::logitsPostProcessor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request26setLogitsPostProcessorNameERKNSt6stringE", "tensorrt_llm::executor::Request::setLogitsPostProcessorName"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request26setLogitsPostProcessorNameERKNSt6stringE", "tensorrt_llm::executor::Request::setLogitsPostProcessorName::logitsPostProcessorName"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request18setLookaheadConfigERK23LookaheadDecodingConfig", "tensorrt_llm::executor::Request::setLookaheadConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request18setLookaheadConfigERK23LookaheadDecodingConfig", "tensorrt_llm::executor::Request::setLookaheadConfig::lookaheadConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request13setLoraConfigERK10LoraConfig", "tensorrt_llm::executor::Request::setLoraConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request13setLoraConfigERK10LoraConfig", "tensorrt_llm::executor::Request::setLoraConfig::loraConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request14setMropeConfigERK11MropeConfig", "tensorrt_llm::executor::Request::setMropeConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request14setMropeConfigERK11MropeConfig", "tensorrt_llm::executor::Request::setMropeConfig::mRopeConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request22setMultimodalEmbeddingERK6Tensor", "tensorrt_llm::executor::Request::setMultimodalEmbedding"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request22setMultimodalEmbeddingERK6Tensor", "tensorrt_llm::executor::Request::setMultimodalEmbedding::multimodalEmbedding"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request15setOutputConfigERK12OutputConfig", "tensorrt_llm::executor::Request::setOutputConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request15setOutputConfigERK12OutputConfig", "tensorrt_llm::executor::Request::setOutputConfig::outputConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request8setPadIdE10SizeType32", "tensorrt_llm::executor::Request::setPadId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request8setPadIdE10SizeType32", "tensorrt_llm::executor::Request::setPadId::padId"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor7Request14setPositionIdsERKNSt6vectorI10SizeType32EE", "tensorrt_llm::executor::Request::setPositionIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request14setPositionIdsERKNSt6vectorI10SizeType32EE", "tensorrt_llm::executor::Request::setPositionIds::positionIds"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request11setPriorityE12PriorityType", "tensorrt_llm::executor::Request::setPriority"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request11setPriorityE12PriorityType", "tensorrt_llm::executor::Request::setPriority::priority"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request21setPromptTuningConfigERK18PromptTuningConfig", "tensorrt_llm::executor::Request::setPromptTuningConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request21setPromptTuningConfigERK18PromptTuningConfig", "tensorrt_llm::executor::Request::setPromptTuningConfig::pTuningConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request14setRequestTypeERK11RequestType", "tensorrt_llm::executor::Request::setRequestType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request14setRequestTypeERK11RequestType", "tensorrt_llm::executor::Request::setRequestType::requestType"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request27setReturnAllGeneratedTokensEb", "tensorrt_llm::executor::Request::setReturnAllGeneratedTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request27setReturnAllGeneratedTokensEb", "tensorrt_llm::executor::Request::setReturnAllGeneratedTokens::returnAllGeneratedTokens"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request17setSamplingConfigERK14SamplingConfig", "tensorrt_llm::executor::Request::setSamplingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request17setSamplingConfigERK14SamplingConfig", "tensorrt_llm::executor::Request::setSamplingConfig::config"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request22setSkipCrossAttnBlocksE6Tensor", "tensorrt_llm::executor::Request::setSkipCrossAttnBlocks"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor7Request22setSkipCrossAttnBlocksE6Tensor", "tensorrt_llm::executor::Request::setSkipCrossAttnBlocks::skipCrossAttnBlocks"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request12setStopWordsERKNSt4listI9VecTokensEE", "tensorrt_llm::executor::Request::setStopWords"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request12setStopWordsERKNSt4listI9VecTokensEE", "tensorrt_llm::executor::Request::setStopWords::stopWords"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7Request12setStreamingEb", "tensorrt_llm::executor::Request::setStreaming"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor7Request12setStreamingEb", "tensorrt_llm::executor::Request::setStreaming::streaming"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7RequestD0Ev", "tensorrt_llm::executor::Request::~Request"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetricsE", "tensorrt_llm::executor::RequestPerfMetrics"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetricsE", "tensorrt_llm::executor::RequestPerfMetrics::KvCacheMetrics"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics14kvCacheHitRateE", "tensorrt_llm::executor::RequestPerfMetrics::KvCacheMetrics::kvCacheHitRate"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics15numMissedBlocksE", "tensorrt_llm::executor::RequestPerfMetrics::KvCacheMetrics::numMissedBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics21numNewAllocatedBlocksE", "tensorrt_llm::executor::RequestPerfMetrics::KvCacheMetrics::numNewAllocatedBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics15numReusedBlocksE", "tensorrt_llm::executor::RequestPerfMetrics::KvCacheMetrics::numReusedBlocks"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14KvCacheMetrics23numTotalAllocatedBlocksE", "tensorrt_llm::executor::RequestPerfMetrics::KvCacheMetrics::numTotalAllocatedBlocks"], [0, 2, 1, 
"_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics26SpeculativeDecodingMetricsE", "tensorrt_llm::executor::RequestPerfMetrics::SpeculativeDecodingMetrics"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics26SpeculativeDecodingMetrics14acceptanceRateE", "tensorrt_llm::executor::RequestPerfMetrics::SpeculativeDecodingMetrics::acceptanceRate"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics26SpeculativeDecodingMetrics24totalAcceptedDraftTokensE", "tensorrt_llm::executor::RequestPerfMetrics::SpeculativeDecodingMetrics::totalAcceptedDraftTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics26SpeculativeDecodingMetrics16totalDraftTokensE", "tensorrt_llm::executor::RequestPerfMetrics::SpeculativeDecodingMetrics::totalDraftTokens"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics9TimePointE", "tensorrt_llm::executor::RequestPerfMetrics::TimePoint"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetricsE", "tensorrt_llm::executor::RequestPerfMetrics::TimingMetrics"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics11arrivalTimeE", "tensorrt_llm::executor::RequestPerfMetrics::TimingMetrics::arrivalTime"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics18firstScheduledTimeE", "tensorrt_llm::executor::RequestPerfMetrics::TimingMetrics::firstScheduledTime"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics14firstTokenTimeE", "tensorrt_llm::executor::RequestPerfMetrics::TimingMetrics::firstTokenTime"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics11kvCacheSizeE", "tensorrt_llm::executor::RequestPerfMetrics::TimingMetrics::kvCacheSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics18kvCacheTransferEndE", "tensorrt_llm::executor::RequestPerfMetrics::TimingMetrics::kvCacheTransferEnd"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics20kvCacheTransferStartE", "tensorrt_llm::executor::RequestPerfMetrics::TimingMetrics::kvCacheTransferStart"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13TimingMetrics13lastTokenTimeE", "tensorrt_llm::executor::RequestPerfMetrics::TimingMetrics::lastTokenTime"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics9firstIterE", "tensorrt_llm::executor::RequestPerfMetrics::firstIter"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics4iterE", "tensorrt_llm::executor::RequestPerfMetrics::iter"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics14kvCacheMetricsE", "tensorrt_llm::executor::RequestPerfMetrics::kvCacheMetrics"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics8lastIterE", "tensorrt_llm::executor::RequestPerfMetrics::lastIter"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics19speculativeDecodingE", "tensorrt_llm::executor::RequestPerfMetrics::speculativeDecoding"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor18RequestPerfMetrics13timingMetricsE", "tensorrt_llm::executor::RequestPerfMetrics::timingMetrics"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor12RequestStageE", "tensorrt_llm::executor::RequestStage"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12RequestStage20kCONTEXT_IN_PROGRESSE", "tensorrt_llm::executor::RequestStage::kCONTEXT_IN_PROGRESS"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12RequestStage20kENCODER_IN_PROGRESSE", "tensorrt_llm::executor::RequestStage::kENCODER_IN_PROGRESS"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12RequestStage20kGENERATION_COMPLETEE", "tensorrt_llm::executor::RequestStage::kGENERATION_COMPLETE"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12RequestStage23kGENERATION_IN_PROGRESSE", "tensorrt_llm::executor::RequestStage::kGENERATION_IN_PROGRESS"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor12RequestStage7kQUEUEDE", "tensorrt_llm::executor::RequestStage::kQUEUED"], [0, 2, 1, 
"_CPPv4N12tensorrt_llm8executor12RequestStatsE", "tensorrt_llm::executor::RequestStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats24allocNewBlocksPerRequestE", "tensorrt_llm::executor::RequestStats::allocNewBlocksPerRequest"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats26allocTotalBlocksPerRequestE", "tensorrt_llm::executor::RequestStats::allocTotalBlocksPerRequest"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats26avgNumDecodedTokensPerIterE", "tensorrt_llm::executor::RequestStats::avgNumDecodedTokensPerIter"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats22contextPrefillPositionE", "tensorrt_llm::executor::RequestStats::contextPrefillPosition"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats15disServingStatsE", "tensorrt_llm::executor::RequestStats::disServingStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats2idE", "tensorrt_llm::executor::RequestStats::id"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats24kvCacheHitRatePerRequestE", "tensorrt_llm::executor::RequestStats::kvCacheHitRatePerRequest"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats22missedBlocksPerRequestE", "tensorrt_llm::executor::RequestStats::missedBlocksPerRequest"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats18numGeneratedTokensE", "tensorrt_llm::executor::RequestStats::numGeneratedTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats6pausedE", "tensorrt_llm::executor::RequestStats::paused"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats22reusedBlocksPerRequestE", "tensorrt_llm::executor::RequestStats::reusedBlocksPerRequest"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats9scheduledE", "tensorrt_llm::executor::RequestStats::scheduled"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor12RequestStats5stageE", "tensorrt_llm::executor::RequestStats::stage"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor24RequestStatsPerIterationE", 
"tensorrt_llm::executor::RequestStatsPerIteration"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor24RequestStatsPerIteration4iterE", "tensorrt_llm::executor::RequestStatsPerIteration::iter"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor24RequestStatsPerIteration12requestStatsE", "tensorrt_llm::executor::RequestStatsPerIteration::requestStats"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor11RequestTypeE", "tensorrt_llm::executor::RequestType"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor11RequestType35REQUEST_TYPE_CONTEXT_AND_GENERATIONE", "tensorrt_llm::executor::RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor11RequestType25REQUEST_TYPE_CONTEXT_ONLYE", "tensorrt_llm::executor::RequestType::REQUEST_TYPE_CONTEXT_ONLY"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor11RequestType28REQUEST_TYPE_GENERATION_ONLYE", "tensorrt_llm::executor::RequestType::REQUEST_TYPE_GENERATION_ONLY"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8ResponseE", "tensorrt_llm::executor::Response"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdType6ResultNSt8optionalI6IdTypeEE", "tensorrt_llm::executor::Response::Response"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdTypeNSt6stringENSt8optionalI6IdTypeEE", "tensorrt_llm::executor::Response::Response"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseERK8Response", "tensorrt_llm::executor::Response::Response"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseERR8Response", "tensorrt_llm::executor::Response::Response"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdType6ResultNSt8optionalI6IdTypeEE", "tensorrt_llm::executor::Response::Response::Result"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdType6ResultNSt8optionalI6IdTypeEE", "tensorrt_llm::executor::Response::Response::clientId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdTypeNSt6stringENSt8optionalI6IdTypeEE", 
"tensorrt_llm::executor::Response::Response::clientId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdTypeNSt6stringENSt8optionalI6IdTypeEE", "tensorrt_llm::executor::Response::Response::errorMsg"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseERK8Response", "tensorrt_llm::executor::Response::Response::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseERR8Response", "tensorrt_llm::executor::Response::Response::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdType6ResultNSt8optionalI6IdTypeEE", "tensorrt_llm::executor::Response::Response::requestId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8Response8ResponseE6IdTypeNSt6stringENSt8optionalI6IdTypeEE", "tensorrt_llm::executor::Response::Response::requestId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8Response11getClientIdEv", "tensorrt_llm::executor::Response::getClientId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8Response11getErrorMsgEv", "tensorrt_llm::executor::Response::getErrorMsg"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8Response12getRequestIdEv", "tensorrt_llm::executor::Response::getRequestId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8Response9getResultEv", "tensorrt_llm::executor::Response::getResult"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8Response8hasErrorEv", "tensorrt_llm::executor::Response::hasError"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8Response5mImplE", "tensorrt_llm::executor::Response::mImpl"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8ResponseaSERK8Response", "tensorrt_llm::executor::Response::operator="], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8ResponseaSERR8Response", "tensorrt_llm::executor::Response::operator="], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8ResponseaSERK8Response", "tensorrt_llm::executor::Response::operator=::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8ResponseaSERR8Response", "tensorrt_llm::executor::Response::operator=::other"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor8ResponseD0Ev", "tensorrt_llm::executor::Response::~Response"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor6ResultE", "tensorrt_llm::executor::Result"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result17additionalOutputsE", "tensorrt_llm::executor::Result::additionalOutputs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result13contextLogitsE", "tensorrt_llm::executor::Result::contextLogits"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result18contextPhaseParamsE", "tensorrt_llm::executor::Result::contextPhaseParams"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result11cumLogProbsE", "tensorrt_llm::executor::Result::cumLogProbs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result12decodingIterE", "tensorrt_llm::executor::Result::decodingIter"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result13encoderOutputE", "tensorrt_llm::executor::Result::encoderOutput"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result13finishReasonsE", "tensorrt_llm::executor::Result::finishReasons"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result16generationLogitsE", "tensorrt_llm::executor::Result::generationLogits"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result7isFinalE", "tensorrt_llm::executor::Result::isFinal"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result15isSequenceFinalE", "tensorrt_llm::executor::Result::isSequenceFinal"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result8logProbsE", "tensorrt_llm::executor::Result::logProbs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result14outputTokenIdsE", "tensorrt_llm::executor::Result::outputTokenIds"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result18requestPerfMetricsE", "tensorrt_llm::executor::Result::requestPerfMetrics"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result13sequenceIndexE", "tensorrt_llm::executor::Result::sequenceIndex"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Result21specDecFastLogitsInfoE", "tensorrt_llm::executor::Result::specDecFastLogitsInfo"], [0, 1, 1, 
"_CPPv4N12tensorrt_llm8executor17RetentionPriorityE", "tensorrt_llm::executor::RetentionPriority"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDurationE", "tensorrt_llm::executor::RetentionPriorityAndDuration"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDuration28RetentionPriorityAndDurationERKNSt8optionalI17RetentionPriorityEERKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::RetentionPriorityAndDuration::RetentionPriorityAndDuration"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDuration28RetentionPriorityAndDurationERKNSt8optionalI17RetentionPriorityEERKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::RetentionPriorityAndDuration::RetentionPriorityAndDuration::durationMs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDuration28RetentionPriorityAndDurationERKNSt8optionalI17RetentionPriorityEERKNSt8optionalINSt6chrono12millisecondsEEE", "tensorrt_llm::executor::RetentionPriorityAndDuration::RetentionPriorityAndDuration::retentionPriority"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDuration10durationMsE", "tensorrt_llm::executor::RetentionPriorityAndDuration::durationMs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor28RetentionPriorityAndDuration17retentionPriorityE", "tensorrt_llm::executor::RetentionPriorityAndDuration::retentionPriority"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfigE", "tensorrt_llm::executor::SamplingConfig"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::beamSearchDiversityRate"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", 
"tensorrt_llm::executor::SamplingConfig::SamplingConfig::beamWidth"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::beamWidthArray"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::earlyStopping"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::frequencyPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::lengthPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", 
"tensorrt_llm::executor::SamplingConfig::SamplingConfig::minP"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::minTokens"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::noRepeatNgramSize"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::numReturnSequences"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::presencePenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", 
"tensorrt_llm::executor::SamplingConfig::SamplingConfig::repetitionPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::seed"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::temperature"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::topK"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::topP"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", 
"tensorrt_llm::executor::SamplingConfig::SamplingConfig::topPDecay"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::topPMin"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14SamplingConfigE10SizeType32RKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI11TokenIdTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI14RandomSeedTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI9FloatTypeEERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI10SizeType32EERKNSt8optionalI9FloatTypeEERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::SamplingConfig::topPResetIds"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig28checkBeamSearchDiversityRateERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkBeamSearchDiversityRate"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig28checkBeamSearchDiversityRateERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkBeamSearchDiversityRate::beamSearchDiversityRate"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14checkBeamWidthE10SizeType32", "tensorrt_llm::executor::SamplingConfig::checkBeamWidth"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig14checkBeamWidthE10SizeType32", "tensorrt_llm::executor::SamplingConfig::checkBeamWidth::beamWidth"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig19checkBeamWidthArrayERKNSt8optionalINSt6vectorI10SizeType32EEEEK10SizeType32", "tensorrt_llm::executor::SamplingConfig::checkBeamWidthArray"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig19checkBeamWidthArrayERKNSt8optionalINSt6vectorI10SizeType32EEEEK10SizeType32", "tensorrt_llm::executor::SamplingConfig::checkBeamWidthArray::beamWidth"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig19checkBeamWidthArrayERKNSt8optionalINSt6vectorI10SizeType32EEEEK10SizeType32", "tensorrt_llm::executor::SamplingConfig::checkBeamWidthArray::beamWidthArray"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18checkEarlyStoppingERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::checkEarlyStopping"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18checkEarlyStoppingERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::checkEarlyStopping::earlyStopping"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18checkLengthPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkLengthPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18checkLengthPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkLengthPenalty::lengthPenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig9checkMinPERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkMinP"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig9checkMinPERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkMinP::minP"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14checkMinTokensERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::checkMinTokens"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig14checkMinTokensERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::checkMinTokens::minTokens"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig22checkNoRepeatNgramSizeERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::checkNoRepeatNgramSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig22checkNoRepeatNgramSizeERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::checkNoRepeatNgramSize::noRepeatNgramSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig23checkNumReturnSequencesERKNSt8optionalI10SizeType32EE10SizeType32", "tensorrt_llm::executor::SamplingConfig::checkNumReturnSequences"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig23checkNumReturnSequencesERKNSt8optionalI10SizeType32EE10SizeType32", "tensorrt_llm::executor::SamplingConfig::checkNumReturnSequences::beamWidth"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig23checkNumReturnSequencesERKNSt8optionalI10SizeType32EE10SizeType32", "tensorrt_llm::executor::SamplingConfig::checkNumReturnSequences::numReturnSequences"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig22checkRepetitionPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkRepetitionPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig22checkRepetitionPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkRepetitionPenalty::repetitionpenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16checkTemperatureERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTemperature"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16checkTemperatureERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTemperature::temperature"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig9checkTopKERKNSt8optionalI9FloatTypeEE", 
"tensorrt_llm::executor::SamplingConfig::checkTopK"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig9checkTopKERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTopK::topK"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig9checkTopPERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTopP"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig9checkTopPERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTopP::topP"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14checkTopPDecayERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTopPDecay"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14checkTopPDecayERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTopPDecay::topPDecay"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12checkTopPMinERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTopPMin"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12checkTopPMinERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTopPMin::topPMin"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig17checkTopPResetIdsERKNSt8optionalI11TokenIdTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTopPResetIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig17checkTopPResetIdsERKNSt8optionalI11TokenIdTypeEE", "tensorrt_llm::executor::SamplingConfig::checkTopPResetIds::topPResetIds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig26getBeamSearchDiversityRateEv", "tensorrt_llm::executor::SamplingConfig::getBeamSearchDiversityRate"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig12getBeamWidthEv", "tensorrt_llm::executor::SamplingConfig::getBeamWidth"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig17getBeamWidthArrayEv", "tensorrt_llm::executor::SamplingConfig::getBeamWidthArray"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor14SamplingConfig16getEarlyStoppingEv", "tensorrt_llm::executor::SamplingConfig::getEarlyStopping"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig19getFrequencyPenaltyEv", "tensorrt_llm::executor::SamplingConfig::getFrequencyPenalty"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig16getLengthPenaltyEv", "tensorrt_llm::executor::SamplingConfig::getLengthPenalty"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig7getMinPEv", "tensorrt_llm::executor::SamplingConfig::getMinP"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig12getMinTokensEv", "tensorrt_llm::executor::SamplingConfig::getMinTokens"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig20getNoRepeatNgramSizeEv", "tensorrt_llm::executor::SamplingConfig::getNoRepeatNgramSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig17getNumReturnBeamsEv", "tensorrt_llm::executor::SamplingConfig::getNumReturnBeams"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig21getNumReturnSequencesEv", "tensorrt_llm::executor::SamplingConfig::getNumReturnSequences"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig18getPresencePenaltyEv", "tensorrt_llm::executor::SamplingConfig::getPresencePenalty"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig20getRepetitionPenaltyEv", "tensorrt_llm::executor::SamplingConfig::getRepetitionPenalty"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig7getSeedEv", "tensorrt_llm::executor::SamplingConfig::getSeed"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig14getTemperatureEv", "tensorrt_llm::executor::SamplingConfig::getTemperature"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig7getTopKEv", "tensorrt_llm::executor::SamplingConfig::getTopK"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig7getTopPEv", "tensorrt_llm::executor::SamplingConfig::getTopP"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor14SamplingConfig12getTopPDecayEv", "tensorrt_llm::executor::SamplingConfig::getTopPDecay"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig10getTopPMinEv", "tensorrt_llm::executor::SamplingConfig::getTopPMin"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfig15getTopPResetIdsEv", "tensorrt_llm::executor::SamplingConfig::getTopPResetIds"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig24mBeamSearchDiversityRateE", "tensorrt_llm::executor::SamplingConfig::mBeamSearchDiversityRate"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig10mBeamWidthE", "tensorrt_llm::executor::SamplingConfig::mBeamWidth"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig15mBeamWidthArrayE", "tensorrt_llm::executor::SamplingConfig::mBeamWidthArray"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14mEarlyStoppingE", "tensorrt_llm::executor::SamplingConfig::mEarlyStopping"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig17mFrequencyPenaltyE", "tensorrt_llm::executor::SamplingConfig::mFrequencyPenalty"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14mLengthPenaltyE", "tensorrt_llm::executor::SamplingConfig::mLengthPenalty"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig5mMinPE", "tensorrt_llm::executor::SamplingConfig::mMinP"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig10mMinTokensE", "tensorrt_llm::executor::SamplingConfig::mMinTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18mNoRepeatNgramSizeE", "tensorrt_llm::executor::SamplingConfig::mNoRepeatNgramSize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig15mNumReturnBeamsE", "tensorrt_llm::executor::SamplingConfig::mNumReturnBeams"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig19mNumReturnSequencesE", "tensorrt_llm::executor::SamplingConfig::mNumReturnSequences"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16mPresencePenaltyE", 
"tensorrt_llm::executor::SamplingConfig::mPresencePenalty"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18mRepetitionPenaltyE", "tensorrt_llm::executor::SamplingConfig::mRepetitionPenalty"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig5mSeedE", "tensorrt_llm::executor::SamplingConfig::mSeed"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12mTemperatureE", "tensorrt_llm::executor::SamplingConfig::mTemperature"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig5mTopKE", "tensorrt_llm::executor::SamplingConfig::mTopK"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig5mTopPE", "tensorrt_llm::executor::SamplingConfig::mTopP"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig10mTopPDecayE", "tensorrt_llm::executor::SamplingConfig::mTopPDecay"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig8mTopPMinE", "tensorrt_llm::executor::SamplingConfig::mTopPMin"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig13mTopPResetIdsE", "tensorrt_llm::executor::SamplingConfig::mTopPResetIds"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfigeqERK14SamplingConfig", "tensorrt_llm::executor::SamplingConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor14SamplingConfigeqERK14SamplingConfig", "tensorrt_llm::executor::SamplingConfig::operator==::other"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig26setBeamSearchDiversityRateERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setBeamSearchDiversityRate"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig26setBeamSearchDiversityRateERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setBeamSearchDiversityRate::beamSearchDiversityRate"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12setBeamWidthE10SizeType32", "tensorrt_llm::executor::SamplingConfig::setBeamWidth"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12setBeamWidthE10SizeType32", 
"tensorrt_llm::executor::SamplingConfig::setBeamWidth::beamWidth"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig17setBeamWidthArrayERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::setBeamWidthArray"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig17setBeamWidthArrayERKNSt8optionalINSt6vectorI10SizeType32EEEE", "tensorrt_llm::executor::SamplingConfig::setBeamWidthArray::beamWidthArray"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16setEarlyStoppingERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setEarlyStopping"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16setEarlyStoppingERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setEarlyStopping::earlyStopping"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig19setFrequencyPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setFrequencyPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig19setFrequencyPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setFrequencyPenalty::frequencyPenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16setLengthPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setLengthPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig16setLengthPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setLengthPenalty::lengthPenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setMinPERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setMinP"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setMinPERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setMinP::minP"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12setMinTokensERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setMinTokens"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig12setMinTokensERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setMinTokens::minTokens"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig20setNoRepeatNgramSizeERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setNoRepeatNgramSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig20setNoRepeatNgramSizeERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setNoRepeatNgramSize::noRepeatNgramSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig21setNumReturnSequencesERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setNumReturnSequences"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig21setNumReturnSequencesERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setNumReturnSequences::numReturnSequences"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18setPresencePenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setPresencePenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig18setPresencePenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setPresencePenalty::presencePenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig20setRepetitionPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setRepetitionPenalty"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig20setRepetitionPenaltyERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setRepetitionPenalty::repetitionPenalty"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setSeedERKNSt8optionalI14RandomSeedTypeEE", "tensorrt_llm::executor::SamplingConfig::setSeed"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setSeedERKNSt8optionalI14RandomSeedTypeEE", "tensorrt_llm::executor::SamplingConfig::setSeed::seed"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig14setTemperatureERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setTemperature"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig14setTemperatureERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setTemperature::temperature"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setTopKERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setTopK"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setTopKERKNSt8optionalI10SizeType32EE", "tensorrt_llm::executor::SamplingConfig::setTopK::topK"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setTopPERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setTopP"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig7setTopPERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setTopP::topP"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12setTopPDecayERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setTopPDecay"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig12setTopPDecayERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setTopPDecay::topPDecay"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig10setTopPMinERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setTopPMin"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig10setTopPMinERKNSt8optionalI9FloatTypeEE", "tensorrt_llm::executor::SamplingConfig::setTopPMin::topPMin"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig15setTopPResetIdsERKNSt8optionalI11TokenIdTypeEE", "tensorrt_llm::executor::SamplingConfig::setTopPResetIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor14SamplingConfig15setTopPResetIdsERKNSt8optionalI11TokenIdTypeEE", "tensorrt_llm::executor::SamplingConfig::setTopPResetIds::topPResetIds"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor14SamplingConfig20updateNumReturnBeamsEv", "tensorrt_llm::executor::SamplingConfig::updateNumReturnBeams"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor15SchedulerConfigE", "tensorrt_llm::executor::SchedulerConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig15SchedulerConfigE23CapacitySchedulerPolicyNSt8optionalI21ContextChunkingPolicyEENSt8optionalI18DynamicBatchConfigEE", "tensorrt_llm::executor::SchedulerConfig::SchedulerConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig15SchedulerConfigE23CapacitySchedulerPolicyNSt8optionalI21ContextChunkingPolicyEENSt8optionalI18DynamicBatchConfigEE", "tensorrt_llm::executor::SchedulerConfig::SchedulerConfig::capacitySchedulerPolicy"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig15SchedulerConfigE23CapacitySchedulerPolicyNSt8optionalI21ContextChunkingPolicyEENSt8optionalI18DynamicBatchConfigEE", "tensorrt_llm::executor::SchedulerConfig::SchedulerConfig::contextChunkingPolicy"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig15SchedulerConfigE23CapacitySchedulerPolicyNSt8optionalI21ContextChunkingPolicyEENSt8optionalI18DynamicBatchConfigEE", "tensorrt_llm::executor::SchedulerConfig::SchedulerConfig::dynamicBatchConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15SchedulerConfig26getCapacitySchedulerPolicyEv", "tensorrt_llm::executor::SchedulerConfig::getCapacitySchedulerPolicy"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15SchedulerConfig24getContextChunkingPolicyEv", "tensorrt_llm::executor::SchedulerConfig::getContextChunkingPolicy"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15SchedulerConfig21getDynamicBatchConfigEv", "tensorrt_llm::executor::SchedulerConfig::getDynamicBatchConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig24mCapacitySchedulerPolicyE", "tensorrt_llm::executor::SchedulerConfig::mCapacitySchedulerPolicy"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig22mContextChunkingPolicyE", 
"tensorrt_llm::executor::SchedulerConfig::mContextChunkingPolicy"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15SchedulerConfig19mDynamicBatchConfigE", "tensorrt_llm::executor::SchedulerConfig::mDynamicBatchConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15SchedulerConfigeqERK15SchedulerConfig", "tensorrt_llm::executor::SchedulerConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor15SchedulerConfigeqERK15SchedulerConfig", "tensorrt_llm::executor::SchedulerConfig::operator==::other"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor13SerializationE", "tensorrt_llm::executor::Serialization"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization32deserializeAdditionalModelOutputERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeAdditionalModelOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization32deserializeAdditionalModelOutputERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeAdditionalModelOutput::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization27deserializeAdditionalOutputERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeAdditionalOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization27deserializeAdditionalOutputERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeAdditionalOutput::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization21deserializeAgentStateERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeAgentState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization21deserializeAgentStateERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeAgentState::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization15deserializeBoolERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeBool"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization15deserializeBoolERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeBool::is"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization21deserializeCacheStateERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeCacheState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization21deserializeCacheStateERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeCacheState::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization33deserializeCacheTransceiverConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeCacheTransceiverConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization33deserializeCacheTransceiverConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeCacheTransceiverConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeCommStateERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeCommState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeCommStateERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeCommState::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeContextPhaseParamsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeContextPhaseParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeContextPhaseParamsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeContextPhaseParams::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeDataTransceiverStateERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeDataTransceiverState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeDataTransceiverStateERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDataTransceiverState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeDataTransceiverStateERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeDataTransceiverState::buffer"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization31deserializeDataTransceiverStateERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDataTransceiverState::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeDebugConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDebugConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeDebugConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDebugConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeDecodingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeDecodingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDecodingConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeDecodingModeERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDecodingMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeDecodingModeERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDecodingMode::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization33deserializeDisServingRequestStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDisServingRequestStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization33deserializeDisServingRequestStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDisServingRequestStats::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeDynamicBatchConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDynamicBatchConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeDynamicBatchConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeDynamicBatchConfig::is"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization22deserializeEagleConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeEagleConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeEagleConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeEagleConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeExecutorConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeExecutorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeExecutorConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeExecutorConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization40deserializeExtendedRuntimePerfKnobConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeExtendedRuntimePerfKnobConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization40deserializeExtendedRuntimePerfKnobConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeExtendedRuntimePerfKnobConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization36deserializeExternalDraftTokensConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeExternalDraftTokensConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization36deserializeExternalDraftTokensConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeExternalDraftTokensConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeGuidedDecodingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeGuidedDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeGuidedDecodingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeGuidedDecodingConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeGuidedDecodingParamsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeGuidedDecodingParams"], 
[0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization31deserializeGuidedDecodingParamsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeGuidedDecodingParams::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization32deserializeInflightBatchingStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeInflightBatchingStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization32deserializeInflightBatchingStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeInflightBatchingStats::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeIterationStatsERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeIterationStats"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeIterationStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeIterationStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeIterationStatsERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeIterationStats::buffer"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeIterationStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeIterationStats::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization28deserializeIterationStatsVecERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeIterationStatsVec"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization28deserializeIterationStatsVecERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeIterationStatsVec::buffer"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization24deserializeKvCacheConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeKvCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization24deserializeKvCacheConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeKvCacheConfig::is"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization33deserializeKvCacheRetentionConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeKvCacheRetentionConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization33deserializeKvCacheRetentionConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeKvCacheRetentionConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeKvCacheStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeKvCacheStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeKvCacheStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeKvCacheStats::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization34deserializeLookaheadDecodingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeLookaheadDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization34deserializeLookaheadDecodingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeLookaheadDecodingConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization21deserializeLoraConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeLoraConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization21deserializeLoraConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeLoraConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeModelTypeERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeModelType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeModelTypeERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeModelType::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeMropeConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeMropeConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeMropeConfigERNSt7istreamE", 
"tensorrt_llm::executor::Serialization::deserializeMropeConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeOrchestratorConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeOrchestratorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeOrchestratorConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeOrchestratorConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeOutputConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeOutputConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeOutputConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeOutputConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeParallelConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeParallelConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeParallelConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeParallelConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization26deserializePeftCacheConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializePeftCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization26deserializePeftCacheConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializePeftCacheConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializePromptTuningConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializePromptTuningConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializePromptTuningConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializePromptTuningConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization18deserializeRequestERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequest"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization18deserializeRequestERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequest::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeRequestPerfMetricsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequestPerfMetrics"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization29deserializeRequestPerfMetricsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequestPerfMetrics::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeRequestStageERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequestStage"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeRequestStageERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequestStage::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeRequestStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequestStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization23deserializeRequestStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequestStats::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization35deserializeRequestStatsPerIterationERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeRequestStatsPerIteration"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization35deserializeRequestStatsPerIterationERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequestStatsPerIteration"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization35deserializeRequestStatsPerIterationERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeRequestStatsPerIteration::buffer"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization35deserializeRequestStatsPerIterationERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeRequestStatsPerIteration::is"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization38deserializeRequestStatsPerIterationVecERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeRequestStatsPerIterationVec"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization38deserializeRequestStatsPerIterationVecERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeRequestStatsPerIterationVec::buffer"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization19deserializeResponseERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeResponse"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization19deserializeResponseERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeResponse::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeResponsesERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeResponses"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeResponsesERNSt6vectorIcEE", "tensorrt_llm::executor::Serialization::deserializeResponses::buffer"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization17deserializeResultERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeResult"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization17deserializeResultERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeResult::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeSamplingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSamplingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization25deserializeSamplingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSamplingConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization26deserializeSchedulerConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSchedulerConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization26deserializeSchedulerConfigERNSt7istreamE", 
"tensorrt_llm::executor::Serialization::deserializeSchedulerConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeSocketStateERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSocketState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization22deserializeSocketStateERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSocketState::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization32deserializeSpecDecFastLogitsInfoERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSpecDecFastLogitsInfo"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization32deserializeSpecDecFastLogitsInfoERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSpecDecFastLogitsInfo::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization28deserializeSpecDecodingStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSpecDecodingStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization28deserializeSpecDecodingStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSpecDecodingStats::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization36deserializeSpeculativeDecodingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSpeculativeDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization36deserializeSpeculativeDecodingConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeSpeculativeDecodingConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization30deserializeStaticBatchingStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeStaticBatchingStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization30deserializeStaticBatchingStatsERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeStaticBatchingStats::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization17deserializeStringERNSt7istreamE", 
"tensorrt_llm::executor::Serialization::deserializeString"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization17deserializeStringERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeString::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization17deserializeTensorERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeTensor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization17deserializeTensorERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeTensor::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeTimePointERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeTimePoint"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization20deserializeTimePointERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeTimePoint::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization36deserializeTokenRangeRetentionConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeTokenRangeRetentionConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization36deserializeTokenRangeRetentionConfigERNSt7istreamE", "tensorrt_llm::executor::Serialization::deserializeTokenRangeRetentionConfig::is"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK10LoraConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11DebugConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11EagleConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11MropeConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12DecodingModeRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 
1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12KvCacheStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12OutputConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStageRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK13KvCacheConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14DecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ExecutorConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStats", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ParallelConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14SamplingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15PeftCacheConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15SchedulerConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK16AdditionalOutputRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK17SpecDecodingStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18ContextPhaseParamsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18DynamicBatchConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18OrchestratorConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18PromptTuningConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18RequestPerfMetricsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK19StaticBatchingStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20DataTransceiverState", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20DataTransceiverStateRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingParamsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21AdditionalModelOutputRNSt7ostreamE", 
"tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21InflightBatchingStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22CacheTransceiverConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22DisServingRequestStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22KvCacheRetentionConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK23LookaheadDecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK24RequestStatsPerIteration", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK24RequestStatsPerIterationRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25ExternalDraftTokensConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25SpeculativeDecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK29ExtendedRuntimePerfKnobConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK33SpeculativeDecodingFastLogitsInfoRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6ResultRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 
3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6TensorRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK7RequestRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK8ResponseRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN18RequestPerfMetrics9TimePointERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN22KvCacheRetentionConfig25TokenRangeRetentionConfigERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10AgentStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10CacheStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache11SocketStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache9CommStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI14IterationStatsEE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI24RequestStatsPerIterationEE", "tensorrt_llm::executor::Serialization::serialize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI8ResponseEE", "tensorrt_llm::executor::Serialization::serialize"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21AdditionalModelOutputRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::additionalModelOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK16AdditionalOutputRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::additionalOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22CacheTransceiverConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::cacheTransceiverConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK10LoraConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11MropeConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12OutputConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14SamplingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18PromptTuningConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25ExternalDraftTokensConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18ContextPhaseParamsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::contextPhaseParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20DataTransceiverState", "tensorrt_llm::executor::Serialization::serialize::dataTransceiverState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20DataTransceiverStateRNSt7ostreamE", 
"tensorrt_llm::executor::Serialization::serialize::dataTransceiverState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11DebugConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::debugConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14DecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::decodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12DecodingModeRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::decodingMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18DynamicBatchConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::dynamicBatchConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11EagleConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::eagleConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ExecutorConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::executorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK29ExtendedRuntimePerfKnobConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::extendedRuntimePerfKnobConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::guidedDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingParamsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::guidedDecodingParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21InflightBatchingStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::inflightBatchingStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK33SpeculativeDecodingFastLogitsInfoRNSt7ostreamE", 
"tensorrt_llm::executor::Serialization::serialize::info"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStats", "tensorrt_llm::executor::Serialization::serialize::iterStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::iterStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI14IterationStatsEE", "tensorrt_llm::executor::Serialization::serialize::iterStatsVec"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK13KvCacheConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::kvCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22KvCacheRetentionConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::kvCacheRetentionConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12KvCacheStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::kvCacheStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK23LookaheadDecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::lookaheadDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18RequestPerfMetricsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::metrics"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18OrchestratorConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::orchestratorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK10LoraConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11DebugConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11EagleConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK11MropeConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12DecodingModeRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12KvCacheStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12OutputConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStageRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK13KvCacheConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14DecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ExecutorConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14IterationStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ParallelConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14SamplingConfigRNSt7ostreamE", 
"tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15PeftCacheConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15SchedulerConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK16AdditionalOutputRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK17SpecDecodingStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18ContextPhaseParamsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18DynamicBatchConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18OrchestratorConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18PromptTuningConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK18RequestPerfMetricsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK19StaticBatchingStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20DataTransceiverStateRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], 
[0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK20GuidedDecodingParamsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21AdditionalModelOutputRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK21InflightBatchingStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22CacheTransceiverConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22DisServingRequestStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22KvCacheRetentionConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK23LookaheadDecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK24RequestStatsPerIterationRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25ExternalDraftTokensConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25SpeculativeDecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK29ExtendedRuntimePerfKnobConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK33SpeculativeDecodingFastLogitsInfoRNSt7ostreamE", 
"tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6ResultRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6TensorRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK7RequestRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK8ResponseRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN18RequestPerfMetrics9TimePointERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN22KvCacheRetentionConfig25TokenRangeRetentionConfigERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10AgentStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10CacheStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache11SocketStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache9CommStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK14ParallelConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::parallelConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15PeftCacheConfigRNSt7ostreamE", 
"tensorrt_llm::executor::Serialization::serialize::peftCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK7RequestRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::request"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStageRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::requestStage"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI24RequestStatsPerIterationEE", "tensorrt_llm::executor::Serialization::serialize::requestStatsVec"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK8ResponseRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::response"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKNSt6vectorI8ResponseEE", "tensorrt_llm::executor::Serialization::serialize::responses"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6ResultRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::result"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK15SchedulerConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::schedulerConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK25SpeculativeDecodingConfigRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::specDecConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK17SpecDecodingStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::specDecStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK12RequestStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK24RequestStatsPerIteration", "tensorrt_llm::executor::Serialization::serialize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK24RequestStatsPerIterationRNSt7ostreamE", 
"tensorrt_llm::executor::Serialization::serialize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10AgentStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache10CacheStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache11SocketStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN8kv_cache9CommStateERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK19StaticBatchingStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::staticBatchingStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK22DisServingRequestStatsRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::stats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERK6TensorRNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::tensor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN22KvCacheRetentionConfig25TokenRangeRetentionConfigERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::tokenRangeRetentionConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization9serializeERKN18RequestPerfMetrics9TimePointERNSt7ostreamE", "tensorrt_llm::executor::Serialization::serialize::tp"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK10LoraConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11DebugConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11EagleConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11MropeConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12DecodingMode", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12KvCacheStats", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12OutputConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12RequestStage", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12RequestStats", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK13KvCacheConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14DecodingConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14ExecutorConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14IterationStats", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14ParallelConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14SamplingConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK15PeftCacheConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK15SchedulerConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK16AdditionalOutput", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK17SpecDecodingStats", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18ContextPhaseParams", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18DynamicBatchConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18OrchestratorConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18PromptTuningConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18RequestPerfMetrics", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK19StaticBatchingStats", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20DataTransceiverState", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20GuidedDecodingConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20GuidedDecodingParams", 
"tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK21AdditionalModelOutput", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK21InflightBatchingStats", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22CacheTransceiverConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22DisServingRequestStats", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22KvCacheRetentionConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK23LookaheadDecodingConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK24RequestStatsPerIteration", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK25ExternalDraftTokensConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK25SpeculativeDecodingConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK29ExtendedRuntimePerfKnobConfig", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK33SpeculativeDecodingFastLogitsInfo", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK6Result", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK6Tensor", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK7Request", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK8Response", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN18RequestPerfMetrics9TimePointE", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN22KvCacheRetentionConfig25TokenRangeRetentionConfigE", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache10AgentStateE", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache10CacheStateE", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache11SocketStateE", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache9CommStateE", "tensorrt_llm::executor::Serialization::serializedSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK21AdditionalModelOutput", "tensorrt_llm::executor::Serialization::serializedSize::additionalModelOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK16AdditionalOutput", "tensorrt_llm::executor::Serialization::serializedSize::additionalOutput"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22CacheTransceiverConfig", "tensorrt_llm::executor::Serialization::serializedSize::cacheTransceiverConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK10LoraConfig", "tensorrt_llm::executor::Serialization::serializedSize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11MropeConfig", "tensorrt_llm::executor::Serialization::serializedSize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12OutputConfig", "tensorrt_llm::executor::Serialization::serializedSize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14SamplingConfig", "tensorrt_llm::executor::Serialization::serializedSize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18PromptTuningConfig", "tensorrt_llm::executor::Serialization::serializedSize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK25ExternalDraftTokensConfig", "tensorrt_llm::executor::Serialization::serializedSize::config"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18ContextPhaseParams", "tensorrt_llm::executor::Serialization::serializedSize::contextPhaseParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20DataTransceiverState", "tensorrt_llm::executor::Serialization::serializedSize::dataTransceiverState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11DebugConfig", "tensorrt_llm::executor::Serialization::serializedSize::debugConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14DecodingConfig", "tensorrt_llm::executor::Serialization::serializedSize::decodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12DecodingMode", "tensorrt_llm::executor::Serialization::serializedSize::decodingMode"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22DisServingRequestStats", "tensorrt_llm::executor::Serialization::serializedSize::disServingRequestStats"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18DynamicBatchConfig", "tensorrt_llm::executor::Serialization::serializedSize::dynamicBatchConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK11EagleConfig", "tensorrt_llm::executor::Serialization::serializedSize::eagleConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14ExecutorConfig", "tensorrt_llm::executor::Serialization::serializedSize::executorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK29ExtendedRuntimePerfKnobConfig", "tensorrt_llm::executor::Serialization::serializedSize::extendedRuntimePerfKnobConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20GuidedDecodingConfig", "tensorrt_llm::executor::Serialization::serializedSize::guidedDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK20GuidedDecodingParams", "tensorrt_llm::executor::Serialization::serializedSize::guidedDecodingParams"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK21InflightBatchingStats", "tensorrt_llm::executor::Serialization::serializedSize::inflightBatchingStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK33SpeculativeDecodingFastLogitsInfo", "tensorrt_llm::executor::Serialization::serializedSize::info"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14IterationStats", "tensorrt_llm::executor::Serialization::serializedSize::iterStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK13KvCacheConfig", "tensorrt_llm::executor::Serialization::serializedSize::kvCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK22KvCacheRetentionConfig", "tensorrt_llm::executor::Serialization::serializedSize::kvCacheRetentionConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12KvCacheStats", "tensorrt_llm::executor::Serialization::serializedSize::kvCacheStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK23LookaheadDecodingConfig", "tensorrt_llm::executor::Serialization::serializedSize::lookaheadDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18RequestPerfMetrics", "tensorrt_llm::executor::Serialization::serializedSize::metrics"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK18OrchestratorConfig", "tensorrt_llm::executor::Serialization::serializedSize::orchestratorConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK14ParallelConfig", "tensorrt_llm::executor::Serialization::serializedSize::parallelConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK15PeftCacheConfig", "tensorrt_llm::executor::Serialization::serializedSize::peftCacheConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK7Request", "tensorrt_llm::executor::Serialization::serializedSize::request"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12RequestStage", "tensorrt_llm::executor::Serialization::serializedSize::requestStage"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK8Response", "tensorrt_llm::executor::Serialization::serializedSize::response"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK6Result", "tensorrt_llm::executor::Serialization::serializedSize::result"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK15SchedulerConfig", "tensorrt_llm::executor::Serialization::serializedSize::schedulerConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK25SpeculativeDecodingConfig", "tensorrt_llm::executor::Serialization::serializedSize::specDecConfig"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK17SpecDecodingStats", "tensorrt_llm::executor::Serialization::serializedSize::specDecStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK12RequestStats", "tensorrt_llm::executor::Serialization::serializedSize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK24RequestStatsPerIteration", "tensorrt_llm::executor::Serialization::serializedSize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache10AgentStateE", "tensorrt_llm::executor::Serialization::serializedSize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache10CacheStateE", "tensorrt_llm::executor::Serialization::serializedSize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache11SocketStateE", "tensorrt_llm::executor::Serialization::serializedSize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN8kv_cache9CommStateE", "tensorrt_llm::executor::Serialization::serializedSize::state"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK19StaticBatchingStats", "tensorrt_llm::executor::Serialization::serializedSize::staticBatchingStats"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERK6Tensor", "tensorrt_llm::executor::Serialization::serializedSize::tensor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor13Serialization14serializedSizeERKN22KvCacheRetentionConfig25TokenRangeRetentionConfigE", "tensorrt_llm::executor::Serialization::serializedSize::tokenRangeRetentionConfig"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor5ShapeE", "tensorrt_llm::executor::Shape"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor5Shape4BaseE", "tensorrt_llm::executor::Shape::Base"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor5Shape9DimType64E", "tensorrt_llm::executor::Shape::DimType64"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor5Shape5ShapeENSt16initializer_listI9DimType64EE", "tensorrt_llm::executor::Shape::Shape"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor5Shape5ShapeEPK9DimType64N4Base9size_typeE", "tensorrt_llm::executor::Shape::Shape"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor5Shape5ShapeEv", "tensorrt_llm::executor::Shape::Shape"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor5Shape5ShapeEPK9DimType64N4Base9size_typeE", "tensorrt_llm::executor::Shape::Shape::data"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor5Shape5ShapeENSt16initializer_listI9DimType64EE", "tensorrt_llm::executor::Shape::Shape::dims"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor5Shape5ShapeEPK9DimType64N4Base9size_typeE", "tensorrt_llm::executor::Shape::Shape::size"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor10SizeType32E", "tensorrt_llm::executor::SizeType32"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor10SizeType64E", "tensorrt_llm::executor::SizeType64"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor17SpecDecodingStatsE", "tensorrt_llm::executor::SpecDecodingStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor17SpecDecodingStats16acceptanceLengthE", "tensorrt_llm::executor::SpecDecodingStats::acceptanceLength"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor17SpecDecodingStats13draftOverheadE", "tensorrt_llm::executor::SpecDecodingStats::draftOverhead"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor17SpecDecodingStats13iterLatencyMSE", "tensorrt_llm::executor::SpecDecodingStats::iterLatencyMS"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor17SpecDecodingStats17numAcceptedTokensE", "tensorrt_llm::executor::SpecDecodingStats::numAcceptedTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor17SpecDecodingStats14numDraftTokensE", "tensorrt_llm::executor::SpecDecodingStats::numDraftTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor17SpecDecodingStats26numRequestsWithDraftTokensE", "tensorrt_llm::executor::SpecDecodingStats::numRequestsWithDraftTokens"], [0, 2, 1, 
"_CPPv4N12tensorrt_llm8executor25SpeculativeDecodingConfigE", "tensorrt_llm::executor::SpeculativeDecodingConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor25SpeculativeDecodingConfig25SpeculativeDecodingConfigEb", "tensorrt_llm::executor::SpeculativeDecodingConfig::SpeculativeDecodingConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor25SpeculativeDecodingConfig25SpeculativeDecodingConfigEb", "tensorrt_llm::executor::SpeculativeDecodingConfig::SpeculativeDecodingConfig::fastLogits"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor25SpeculativeDecodingConfig10fastLogitsE", "tensorrt_llm::executor::SpeculativeDecodingConfig::fastLogits"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor25SpeculativeDecodingConfigeqERK25SpeculativeDecodingConfig", "tensorrt_llm::executor::SpeculativeDecodingConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor25SpeculativeDecodingConfigeqERK25SpeculativeDecodingConfig", "tensorrt_llm::executor::SpeculativeDecodingConfig::operator==::other"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor33SpeculativeDecodingFastLogitsInfoE", "tensorrt_llm::executor::SpeculativeDecodingFastLogitsInfo"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor33SpeculativeDecodingFastLogitsInfo18draftParticipantIdE", "tensorrt_llm::executor::SpeculativeDecodingFastLogitsInfo::draftParticipantId"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor33SpeculativeDecodingFastLogitsInfo14draftRequestIdE", "tensorrt_llm::executor::SpeculativeDecodingFastLogitsInfo::draftRequestId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor33SpeculativeDecodingFastLogitsInfo8toTensorEv", "tensorrt_llm::executor::SpeculativeDecodingFastLogitsInfo::toTensor"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStatsE", "tensorrt_llm::executor::StaticBatchingStats"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStats13emptyGenSlotsE", "tensorrt_llm::executor::StaticBatchingStats::emptyGenSlots"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStats18numContextRequestsE", 
"tensorrt_llm::executor::StaticBatchingStats::numContextRequests"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStats12numCtxTokensE", "tensorrt_llm::executor::StaticBatchingStats::numCtxTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStats12numGenTokensE", "tensorrt_llm::executor::StaticBatchingStats::numGenTokens"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor19StaticBatchingStats20numScheduledRequestsE", "tensorrt_llm::executor::StaticBatchingStats::numScheduledRequests"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor9StreamPtrE", "tensorrt_llm::executor::StreamPtr"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor6TensorE", "tensorrt_llm::executor::Tensor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::CudaStreamPtr"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor6Tensor4ImplE", "tensorrt_llm::executor::Tensor::Impl"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorENSt10shared_ptrIN7runtime7ITensorEEE", "tensorrt_llm::executor::Tensor::Tensor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorERK6Tensor", "tensorrt_llm::executor::Tensor::Tensor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorERR6Tensor", "tensorrt_llm::executor::Tensor::Tensor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorEv", "tensorrt_llm::executor::Tensor::Tensor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorERK6Tensor", "tensorrt_llm::executor::Tensor::Tensor::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorERR6Tensor", "tensorrt_llm::executor::Tensor::Tensor::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6TensorENSt10shared_ptrIN7runtime7ITensorEEE", "tensorrt_llm::executor::Tensor::Tensor::tensor"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor6copyToENSt10shared_ptrI4ImplEE13CudaStreamPtr", "tensorrt_llm::executor::Tensor::copyTo"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor6copyToENSt10shared_ptrI4ImplEE13CudaStreamPtr", 
"tensorrt_llm::executor::Tensor::copyTo::stream"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor6copyToENSt10shared_ptrI4ImplEE13CudaStreamPtr", "tensorrt_llm::executor::Tensor::copyTo::tensor"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor9copyToCpuEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToCpu"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor9copyToCpuEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToCpu::stream"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor9copyToGpuEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToGpu"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor9copyToGpuEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToGpu::stream"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor13copyToManagedEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToManaged"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor13copyToManagedEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToManaged::stream"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor12copyToPinnedEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToPinned"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor12copyToPinnedEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToPinned::stream"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor18copyToPooledPinnedEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToPooledPinned"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor18copyToPooledPinnedEN6Tensor13CudaStreamPtrE", "tensorrt_llm::executor::Tensor::copyToPooledPinned::stream"], [0, 3, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor3cpuE6Tensor5Shape", "tensorrt_llm::executor::Tensor::cpu"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor3cpuE8DataType5Shape", "tensorrt_llm::executor::Tensor::cpu"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor3cpuE6Tensor5Shape", "tensorrt_llm::executor::Tensor::cpu::T"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor6Tensor3cpuE8DataType5Shape", "tensorrt_llm::executor::Tensor::cpu::dataType"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor3cpuE6Tensor5Shape", "tensorrt_llm::executor::Tensor::cpu::shape"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor3cpuE8DataType5Shape", "tensorrt_llm::executor::Tensor::cpu::shape"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6detail9ofITensorENSt10shared_ptrIN7runtime7ITensorEEE", "tensorrt_llm::executor::Tensor::detail::ofITensor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6detail9ofITensorENSt10shared_ptrIN7runtime7ITensorEEE", "tensorrt_llm::executor::Tensor::detail::ofITensor::tensor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6detail9toITensorERK6Tensor", "tensorrt_llm::executor::Tensor::detail::toITensor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6detail9toITensorERK6Tensor", "tensorrt_llm::executor::Tensor::detail::toITensor::tensor"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7getDataEv", "tensorrt_llm::executor::Tensor::getData"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor7getDataEv", "tensorrt_llm::executor::Tensor::getData"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor11getDataTypeEv", "tensorrt_llm::executor::Tensor::getDataType"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor13getMemoryTypeEv", "tensorrt_llm::executor::Tensor::getMemoryType"], [0, 3, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor14getRuntimeTypeE8DataTypev", "tensorrt_llm::executor::Tensor::getRuntimeType"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor14getRuntimeTypeE8DataTypev", "tensorrt_llm::executor::Tensor::getRuntimeType::T"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor8getShapeEv", "tensorrt_llm::executor::Tensor::getShape"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor7getSizeEv", "tensorrt_llm::executor::Tensor::getSize"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6Tensor14getSizeInBytesEv", "tensorrt_llm::executor::Tensor::getSizeInBytes"], [0, 3, 
1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor3gpuE6Tensor13CudaStreamPtr5Shape", "tensorrt_llm::executor::Tensor::gpu"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor3gpuE8DataType13CudaStreamPtr5Shape", "tensorrt_llm::executor::Tensor::gpu"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor3gpuE6Tensor13CudaStreamPtr5Shape", "tensorrt_llm::executor::Tensor::gpu::T"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor3gpuE8DataType13CudaStreamPtr5Shape", "tensorrt_llm::executor::Tensor::gpu::dataType"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor3gpuE6Tensor13CudaStreamPtr5Shape", "tensorrt_llm::executor::Tensor::gpu::shape"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor3gpuE8DataType13CudaStreamPtr5Shape", "tensorrt_llm::executor::Tensor::gpu::shape"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor3gpuE6Tensor13CudaStreamPtr5Shape", "tensorrt_llm::executor::Tensor::gpu::stream"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor3gpuE8DataType13CudaStreamPtr5Shape", "tensorrt_llm::executor::Tensor::gpu::stream"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7mTensorE", "tensorrt_llm::executor::Tensor::mTensor"], [0, 3, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor7managedE6Tensor5Shape", "tensorrt_llm::executor::Tensor::managed"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7managedE8DataType5Shape", "tensorrt_llm::executor::Tensor::managed"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor7managedE6Tensor5Shape", "tensorrt_llm::executor::Tensor::managed::T"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7managedE8DataType5Shape", "tensorrt_llm::executor::Tensor::managed::dataType"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor7managedE6Tensor5Shape", "tensorrt_llm::executor::Tensor::managed::shape"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7managedE8DataType5Shape", "tensorrt_llm::executor::Tensor::managed::shape"], [0, 3, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor2ofE6TensorP1T5Shape", 
"tensorrt_llm::executor::Tensor::of"], [0, 3, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor2ofE6TensorR1T", "tensorrt_llm::executor::Tensor::of"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor2ofE8DataTypePv5Shape", "tensorrt_llm::executor::Tensor::of"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor2ofE6TensorP1T5Shape", "tensorrt_llm::executor::Tensor::of::T"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor2ofE6TensorR1T", "tensorrt_llm::executor::Tensor::of::T"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor2ofE6TensorP1T5Shape", "tensorrt_llm::executor::Tensor::of::data"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor2ofE6TensorR1T", "tensorrt_llm::executor::Tensor::of::data"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor2ofE8DataTypePv5Shape", "tensorrt_llm::executor::Tensor::of::data"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor2ofE8DataTypePv5Shape", "tensorrt_llm::executor::Tensor::of::dataType"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor2ofE6TensorP1T5Shape", "tensorrt_llm::executor::Tensor::of::shape"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor2ofE8DataTypePv5Shape", "tensorrt_llm::executor::Tensor::of::shape"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6TensorcvbEv", "tensorrt_llm::executor::Tensor::operator bool"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor6TensorneERK6Tensor", "tensorrt_llm::executor::Tensor::operator!="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor6TensorneERK6Tensor", "tensorrt_llm::executor::Tensor::operator!=::rhs"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6TensoraSERK6Tensor", "tensorrt_llm::executor::Tensor::operator="], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6TensoraSERR6Tensor", "tensorrt_llm::executor::Tensor::operator="], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6TensoraSERK6Tensor", "tensorrt_llm::executor::Tensor::operator=::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6TensoraSERR6Tensor", "tensorrt_llm::executor::Tensor::operator=::other"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor6TensoreqERK6Tensor", "tensorrt_llm::executor::Tensor::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor6TensoreqERK6Tensor", "tensorrt_llm::executor::Tensor::operator==::rhs"], [0, 3, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor6pinnedE6Tensor5Shape", "tensorrt_llm::executor::Tensor::pinned"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6pinnedE8DataType5Shape", "tensorrt_llm::executor::Tensor::pinned"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor6pinnedE6Tensor5Shape", "tensorrt_llm::executor::Tensor::pinned::T"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6pinnedE8DataType5Shape", "tensorrt_llm::executor::Tensor::pinned::dataType"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor6pinnedE6Tensor5Shape", "tensorrt_llm::executor::Tensor::pinned::shape"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor6pinnedE8DataType5Shape", "tensorrt_llm::executor::Tensor::pinned::shape"], [0, 3, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor12pooledPinnedE6Tensor5Shape", "tensorrt_llm::executor::Tensor::pooledPinned"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor12pooledPinnedE8DataType5Shape", "tensorrt_llm::executor::Tensor::pooledPinned"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor12pooledPinnedE6Tensor5Shape", "tensorrt_llm::executor::Tensor::pooledPinned::T"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor12pooledPinnedE8DataType5Shape", "tensorrt_llm::executor::Tensor::pooledPinned::dataType"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor6Tensor12pooledPinnedE6Tensor5Shape", "tensorrt_llm::executor::Tensor::pooledPinned::shape"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor12pooledPinnedE8DataType5Shape", "tensorrt_llm::executor::Tensor::pooledPinned::shape"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7setFromERK6Tensor13CudaStreamPtr", "tensorrt_llm::executor::Tensor::setFrom"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7setFromERK6Tensor13CudaStreamPtr", 
"tensorrt_llm::executor::Tensor::setFrom::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7setFromERK6Tensor13CudaStreamPtr", "tensorrt_llm::executor::Tensor::setFrom::stream"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7setZeroE13CudaStreamPtr", "tensorrt_llm::executor::Tensor::setZero"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6Tensor7setZeroE13CudaStreamPtr", "tensorrt_llm::executor::Tensor::setZero::stream"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6TensorD0Ev", "tensorrt_llm::executor::Tensor::~Tensor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor9TensorPtrE", "tensorrt_llm::executor::TensorPtr"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor11TokenIdTypeE", "tensorrt_llm::executor::TokenIdType"], [0, 2, 1, "_CPPv4I0_bEN12tensorrt_llm8executor10TypeTraitsE", "tensorrt_llm::executor::TypeTraits"], [0, 8, 1, "_CPPv4I0_bEN12tensorrt_llm8executor10TypeTraitsE", "tensorrt_llm::executor::TypeTraits::T"], [0, 2, 1, "_CPPv4I0EN12tensorrt_llm8executor10TypeTraitsIP1TEE", "tensorrt_llm::executor::TypeTraits&lt;T*&gt;"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor10TypeTraitsIP1TEE", "tensorrt_llm::executor::TypeTraits&lt;T*&gt;::T"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10TypeTraitsIP1TE5valueE", "tensorrt_llm::executor::TypeTraits&lt;T*&gt;::value"], [0, 2, 1, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsIbEE", "tensorrt_llm::executor::TypeTraits&lt;bool&gt;"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10TypeTraitsIbE5valueE", "tensorrt_llm::executor::TypeTraits&lt;bool&gt;::value"], [0, 2, 1, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsIfEE", "tensorrt_llm::executor::TypeTraits&lt;float&gt;"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10TypeTraitsIfE5valueE", "tensorrt_llm::executor::TypeTraits&lt;float&gt;::value"], [0, 2, 1, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsI4halfEE", "tensorrt_llm::executor::TypeTraits&lt;half&gt;"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10TypeTraitsI4halfE5valueE", 
"tensorrt_llm::executor::TypeTraits&lt;half&gt;::value"], [0, 2, 1, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsINSt7int32_tEEE", "tensorrt_llm::executor::TypeTraits&lt;std::int32_t&gt;"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10TypeTraitsINSt7int32_tEE5valueE", "tensorrt_llm::executor::TypeTraits&lt;std::int32_t&gt;::value"], [0, 2, 1, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsINSt7int64_tEEE", "tensorrt_llm::executor::TypeTraits&lt;std::int64_t&gt;"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10TypeTraitsINSt7int64_tEE5valueE", "tensorrt_llm::executor::TypeTraits&lt;std::int64_t&gt;::value"], [0, 2, 1, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsINSt6int8_tEEE", "tensorrt_llm::executor::TypeTraits&lt;std::int8_t&gt;"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10TypeTraitsINSt6int8_tEE5valueE", "tensorrt_llm::executor::TypeTraits&lt;std::int8_t&gt;::value"], [0, 2, 1, "_CPPv4IEN12tensorrt_llm8executor10TypeTraitsINSt7uint8_tEEE", "tensorrt_llm::executor::TypeTraits&lt;std::uint8_t&gt;"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor10TypeTraitsINSt7uint8_tEE5valueE", "tensorrt_llm::executor::TypeTraits&lt;std::uint8_t&gt;::value"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor11VecLogProbsE", "tensorrt_llm::executor::VecLogProbs"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor16VecTokenExtraIdsE", "tensorrt_llm::executor::VecTokenExtraIds"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor9VecTokensE", "tensorrt_llm::executor::VecTokens"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor6detailE", "tensorrt_llm::executor::detail"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor6detail9DimType64E", "tensorrt_llm::executor::detail::DimType64"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor6detail9ofITensorENSt10shared_ptrIN7runtime7ITensorEEE", "tensorrt_llm::executor::detail::ofITensor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6detail9ofITensorENSt10shared_ptrIN7runtime7ITensorEEE", "tensorrt_llm::executor::detail::ofITensor::tensor"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor6detail9toITensorERK6Tensor", "tensorrt_llm::executor::detail::toITensor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor6detail9toITensorERK6Tensor", "tensorrt_llm::executor::detail::toITensor::tensor"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executorE", "tensorrt_llm::executor::disagg_executor"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestratorE", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator26DisaggExecutorOrchestratorERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorIN8executor14ExecutorConfigEEERKNSt6vectorIN8executor14ExecutorConfigEEEbb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::DisaggExecutorOrchestrator"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator26DisaggExecutorOrchestratorERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorIN8executor14ExecutorConfigEEERKNSt6vectorIN8executor14ExecutorConfigEEEbb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::DisaggExecutorOrchestrator::ctxEnginePaths"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator26DisaggExecutorOrchestratorERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorIN8executor14ExecutorConfigEEERKNSt6vectorIN8executor14ExecutorConfigEEEbb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::DisaggExecutorOrchestrator::ctxExecutorConfigs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator26DisaggExecutorOrchestratorERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorIN8executor14ExecutorConfigEEERKNSt6vectorIN8executor14ExecutorConfigEEEbb", 
"tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::DisaggExecutorOrchestrator::genEnginePaths"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator26DisaggExecutorOrchestratorERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorIN8executor14ExecutorConfigEEERKNSt6vectorIN8executor14ExecutorConfigEEEbb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::DisaggExecutorOrchestrator::genExecutorConfigs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator26DisaggExecutorOrchestratorERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorIN8executor14ExecutorConfigEEERKNSt6vectorIN8executor14ExecutorConfigEEEbb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::DisaggExecutorOrchestrator::hasContextAwaitThreads"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator26DisaggExecutorOrchestratorERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorINSt10filesystem4pathEEERKNSt6vectorIN8executor14ExecutorConfigEEERKNSt6vectorIN8executor14ExecutorConfigEEEbb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::DisaggExecutorOrchestrator::hasGenAwaitThreads"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator21awaitContextResponsesERKNSt8optionalINSt6chrono12millisecondsEEENSt8optionalIiEE", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::awaitContextResponses"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator21awaitContextResponsesERKNSt8optionalINSt6chrono12millisecondsEEENSt8optionalIiEE", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::awaitContextResponses::contextIdx"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator21awaitContextResponsesERKNSt8optionalINSt6chrono12millisecondsEEENSt8optionalIiEE", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::awaitContextResponses::timeout"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator24awaitGenerationResponsesERKNSt8optionalINSt6chrono12millisecondsEEENSt8optionalIiEE", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::awaitGenerationResponses"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator24awaitGenerationResponsesERKNSt8optionalINSt6chrono12millisecondsEEENSt8optionalIiEE", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::awaitGenerationResponses::genIdx"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator24awaitGenerationResponsesERKNSt8optionalINSt6chrono12millisecondsEEENSt8optionalIiEE", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::awaitGenerationResponses::timeout"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator10canEnqueueEv", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::canEnqueue"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator14enqueueContextERKNSt6vectorIN5texec7RequestEEENSt8optionalIiEEb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::enqueueContext"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator14enqueueContextERKNSt6vectorIN5texec7RequestEEENSt8optionalIiEEb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::enqueueContext::batch"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator14enqueueContextERKNSt6vectorIN5texec7RequestEEENSt8optionalIiEEb", 
"tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::enqueueContext::requests"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator14enqueueContextERKNSt6vectorIN5texec7RequestEEENSt8optionalIiEEb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::enqueueContext::selectContextId"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator17enqueueGenerationERKNSt6vectorIN5texec7RequestEEERKNSt6vectorI6IdTypeEENSt8optionalIiEEb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::enqueueGeneration"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator17enqueueGenerationERKNSt6vectorIN5texec7RequestEEERKNSt6vectorI6IdTypeEENSt8optionalIiEEb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::enqueueGeneration::batch"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator17enqueueGenerationERKNSt6vectorIN5texec7RequestEEERKNSt6vectorI6IdTypeEENSt8optionalIiEEb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::enqueueGeneration::globalRequestIds"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator17enqueueGenerationERKNSt6vectorIN5texec7RequestEEERKNSt6vectorI6IdTypeEENSt8optionalIiEEb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::enqueueGeneration::requests"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator17enqueueGenerationERKNSt6vectorIN5texec7RequestEEERKNSt6vectorI6IdTypeEENSt8optionalIiEEb", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::enqueueGeneration::selectGenIdx"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator19getContextExecutorsEv", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::getContextExecutors"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator15getGenExecutorsEv", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::getGenExecutors"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestrator5mImplE", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::mImpl"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor26DisaggExecutorOrchestratorD0Ev", "tensorrt_llm::executor::disagg_executor::DisaggExecutorOrchestrator::~DisaggExecutorOrchestrator"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdE", "tensorrt_llm::executor::disagg_executor::ResponseWithId"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERK14ResponseWithId", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERKN12tensorrt_llm8executor8ResponseE6IdType", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERR14ResponseWithId", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERRN12tensorrt_llm8executor8ResponseE6IdType", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERKN12tensorrt_llm8executor8ResponseE6IdType", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId::gid"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERRN12tensorrt_llm8executor8ResponseE6IdType", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId::gid"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERK14ResponseWithId", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERR14ResponseWithId", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERKN12tensorrt_llm8executor8ResponseE6IdType", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId::response"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId14ResponseWithIdERRN12tensorrt_llm8executor8ResponseE6IdType", "tensorrt_llm::executor::disagg_executor::ResponseWithId::ResponseWithId::response"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId3gidE", "tensorrt_llm::executor::disagg_executor::ResponseWithId::gid"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdaSERK14ResponseWithId", "tensorrt_llm::executor::disagg_executor::ResponseWithId::operator="], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdaSERR14ResponseWithId", "tensorrt_llm::executor::disagg_executor::ResponseWithId::operator="], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdaSERK14ResponseWithId", "tensorrt_llm::executor::disagg_executor::ResponseWithId::operator=::other"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdaSERR14ResponseWithId", "tensorrt_llm::executor::disagg_executor::ResponseWithId::operator=::other"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithId8responseE", "tensorrt_llm::executor::disagg_executor::ResponseWithId::response"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor15disagg_executor14ResponseWithIdD0Ev", "tensorrt_llm::executor::disagg_executor::ResponseWithId::~ResponseWithId"], [0, 1, 1, 
"_CPPv4N12tensorrt_llm8executor8kv_cacheE", "tensorrt_llm::executor::kv_cache"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor8kv_cacheE", "tensorrt_llm::executor::kv_cache"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor8kv_cacheE", "tensorrt_llm::executor::kv_cache"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor8kv_cacheE", "tensorrt_llm::executor::kv_cache"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDescE", "tensorrt_llm::executor::kv_cache::AgentDesc"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDesc9AgentDescENSt6stringE", "tensorrt_llm::executor::kv_cache::AgentDesc::AgentDesc"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDesc9AgentDescENSt6stringE", "tensorrt_llm::executor::kv_cache::AgentDesc::AgentDesc::backendAgentDesc"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9AgentDesc19getBackendAgentDescEv", "tensorrt_llm::executor::kv_cache::AgentDesc::getBackendAgentDesc"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9AgentDesc17mBackendAgentDescE", "tensorrt_llm::executor::kv_cache::AgentDesc::mBackendAgentDesc"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10AgentStateE", "tensorrt_llm::executor::kv_cache::AgentState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10AgentStateENSt6stringENSt6stringE", "tensorrt_llm::executor::kv_cache::AgentState::AgentState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10AgentStateEv", "tensorrt_llm::executor::kv_cache::AgentState::AgentState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10AgentStateENSt6stringENSt6stringE", "tensorrt_llm::executor::kv_cache::AgentState::AgentState::agentName"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10AgentStateENSt6stringENSt6stringE", "tensorrt_llm::executor::kv_cache::AgentState::AgentState::connectionInfo"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState10mAgentNameE", "tensorrt_llm::executor::kv_cache::AgentState::mAgentName"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor8kv_cache10AgentState15mConnectionInfoE", "tensorrt_llm::executor::kv_cache::AgentState::mConnectionInfo"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10AgentStateeqERK10AgentState", "tensorrt_llm::executor::kv_cache::AgentState::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10AgentStateeqERK10AgentState", "tensorrt_llm::executor::kv_cache::AgentState::operator==::other"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10AgentState8toStringEv", "tensorrt_llm::executor::kv_cache::AgentState::toString"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache15BaseAgentConfigE", "tensorrt_llm::executor::kv_cache::BaseAgentConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache15BaseAgentConfig5mNameE", "tensorrt_llm::executor::kv_cache::BaseAgentConfig::mName"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache15BaseAgentConfig13useProgThreadE", "tensorrt_llm::executor::kv_cache::BaseAgentConfig::useProgThread"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgentE", "tensorrt_llm::executor::kv_cache::BaseTransferAgent"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16checkRemoteDescsERKNSt6stringERK11MemoryDescs", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::checkRemoteDescs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16checkRemoteDescsERKNSt6stringERK11MemoryDescs", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::checkRemoteDescs::memoryDescs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16checkRemoteDescsERKNSt6stringERK11MemoryDescs", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::checkRemoteDescs::name"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent18connectRemoteAgentERKNSt6stringERK18ConnectionInfoType", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::connectRemoteAgent"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent18connectRemoteAgentERKNSt6stringERK18ConnectionInfoType", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::connectRemoteAgent::connectionInfo"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent18connectRemoteAgentERKNSt6stringERK18ConnectionInfoType", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::connectRemoteAgent::name"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16deregisterMemoryERK13RegisterDescs", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::deregisterMemory"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent16deregisterMemoryERK13RegisterDescs", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::deregisterMemory::descs"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17getConnectionInfoEv", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::getConnectionInfo"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17getLocalAgentDescEv", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::getLocalAgentDesc"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent23getNotifiedSyncMessagesEv", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::getNotifiedSyncMessages"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent21invalidateRemoteAgentERKNSt6stringE", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::invalidateRemoteAgent"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent21invalidateRemoteAgentERKNSt6stringE", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::invalidateRemoteAgent::name"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent15loadRemoteAgentERKNSt6stringERK9AgentDesc", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::loadRemoteAgent"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent15loadRemoteAgentERKNSt6stringERK9AgentDesc", 
"tensorrt_llm::executor::kv_cache::BaseTransferAgent::loadRemoteAgent::agentDesc"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent15loadRemoteAgentERKNSt6stringERK9AgentDesc", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::loadRemoteAgent::name"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17notifySyncMessageERKNSt6stringERK11SyncMessage", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::notifySyncMessage"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17notifySyncMessageERKNSt6stringERK11SyncMessage", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::notifySyncMessage::name"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent17notifySyncMessageERKNSt6stringERK11SyncMessage", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::notifySyncMessage::syncMessage"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent14registerMemoryERK13RegisterDescs", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::registerMemory"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent14registerMemoryERK13RegisterDescs", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::registerMemory::descs"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent22submitTransferRequestsERK15TransferRequest", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::submitTransferRequests"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgent22submitTransferRequestsERK15TransferRequest", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::submitTransferRequests::request"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17BaseTransferAgentD0Ev", "tensorrt_llm::executor::kv_cache::BaseTransferAgent::~BaseTransferAgent"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheStateE", "tensorrt_llm::executor::kv_cache::CacheState"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfigE", 
"tensorrt_llm::executor::kv_cache::CacheState::AttentionConfig"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfig15AttentionConfigE13AttentionTypei", "tensorrt_llm::executor::kv_cache::CacheState::AttentionConfig::AttentionConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfig15AttentionConfigE13AttentionTypei", "tensorrt_llm::executor::kv_cache::CacheState::AttentionConfig::AttentionConfig::attentionType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfig15AttentionConfigE13AttentionTypei", "tensorrt_llm::executor::kv_cache::CacheState::AttentionConfig::AttentionConfig::kvFactor"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfig14mAttentionTypeE", "tensorrt_llm::executor::kv_cache::CacheState::AttentionConfig::mAttentionType"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15AttentionConfig9mKvFactorE", "tensorrt_llm::executor::kv_cache::CacheState::AttentionConfig::mKvFactor"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState13AttentionTypeE", "tensorrt_llm::executor::kv_cache::CacheState::AttentionType"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState13AttentionType8kDEFAULTE", "tensorrt_llm::executor::kv_cache::CacheState::AttentionType::kDEFAULT"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState13AttentionType4kMLAE", "tensorrt_llm::executor::kv_cache::CacheState::AttentionType::kMLA"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE11ModelConfigRKN7runtime11WorldConfigEN8nvinfer18DataTypeE13AttentionTypei", "tensorrt_llm::executor::kv_cache::CacheState::CacheState"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::DPrank"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::DPrank"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::DPsize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::DPsize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::attentionType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE11ModelConfigRKN7runtime11WorldConfigEN8nvinfer18DataTypeE13AttentionTypei", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::attentionType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", 
"tensorrt_llm::executor::kv_cache::CacheState::CacheState::attentionType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::dataType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE11ModelConfigRKN7runtime11WorldConfigEN8nvinfer18DataTypeE13AttentionTypei", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::dataType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::dataType"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::enableAttentionDP"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::enableAttentionDP"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::kvFactor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE11ModelConfigRKN7runtime11WorldConfigEN8nvinfer18DataTypeE13AttentionTypei", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::kvFactor"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::kvFactor"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE11ModelConfigRKN7runtime11WorldConfigEN8nvinfer18DataTypeE13AttentionTypei", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::modelConfig"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::nbAttentionLayers"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::nbKvHeadPerLayer"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::nbKvHeads"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::pipelineParallelism"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::pipelineParallelism"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::sizePerHead"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::sizePerHead"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::tensorParallelism"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::tensorParallelism"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::tokensPerBlock"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateENSt6vectorI10SizeType32EE10SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE13AttentionTypeibii", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::tokensPerBlock"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState10CacheStateE11ModelConfigRKN7runtime11WorldConfigEN8nvinfer18DataTypeE13AttentionTypei", "tensorrt_llm::executor::kv_cache::CacheState::CacheState::worldConfig"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState11ModelConfigE", "tensorrt_llm::executor::kv_cache::CacheState::ModelConfig"], [0, 5, 1, 
"_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState11ModelConfig18mNbKvHeadsPerLayerE", "tensorrt_llm::executor::kv_cache::CacheState::ModelConfig::mNbKvHeadsPerLayer"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState11ModelConfig12mSizePerHeadE", "tensorrt_llm::executor::kv_cache::CacheState::ModelConfig::mSizePerHead"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState11ModelConfig15mTokensPerBlockE", "tensorrt_llm::executor::kv_cache::CacheState::ModelConfig::mTokensPerBlock"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState11ModelConfigeqERK11ModelConfig", "tensorrt_llm::executor::kv_cache::CacheState::ModelConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState11ModelConfigeqERK11ModelConfig", "tensorrt_llm::executor::kv_cache::CacheState::ModelConfig::operator==::other"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfigE", "tensorrt_llm::executor::kv_cache::CacheState::ParallelConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig7mDPrankE", "tensorrt_llm::executor::kv_cache::CacheState::ParallelConfig::mDPrank"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig7mDPsizeE", "tensorrt_llm::executor::kv_cache::CacheState::ParallelConfig::mDPsize"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig18mEnableAttentionDPE", "tensorrt_llm::executor::kv_cache::CacheState::ParallelConfig::mEnableAttentionDP"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig20mPipelineParallelismE", "tensorrt_llm::executor::kv_cache::CacheState::ParallelConfig::mPipelineParallelism"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfig18mTensorParallelismE", "tensorrt_llm::executor::kv_cache::CacheState::ParallelConfig::mTensorParallelism"], [0, 3, 1, 
"_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfigeqERK14ParallelConfig", "tensorrt_llm::executor::kv_cache::CacheState::ParallelConfig::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState14ParallelConfigeqERK14ParallelConfig", "tensorrt_llm::executor::kv_cache::CacheState::ParallelConfig::operator==::other"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState18getAttentionConfigEv", "tensorrt_llm::executor::kv_cache::CacheState::getAttentionConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState11getDataTypeEv", "tensorrt_llm::executor::kv_cache::CacheState::getDataType"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState14getModelConfigEv", "tensorrt_llm::executor::kv_cache::CacheState::getModelConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState17getParallelConfigEv", "tensorrt_llm::executor::kv_cache::CacheState::getParallelConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState16mAttentionConfigE", "tensorrt_llm::executor::kv_cache::CacheState::mAttentionConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState9mDataTypeE", "tensorrt_llm::executor::kv_cache::CacheState::mDataType"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState12mModelConfigE", "tensorrt_llm::executor::kv_cache::CacheState::mModelConfig"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10CacheState15mParallelConfigE", "tensorrt_llm::executor::kv_cache::CacheState::mParallelConfig"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheStateeqERKN8kv_cache10CacheStateE", "tensorrt_llm::executor::kv_cache::CacheState::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheStateeqERKN8kv_cache10CacheStateE", "tensorrt_llm::executor::kv_cache::CacheState::operator==::other"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10CacheState8toStringEv", "tensorrt_llm::executor::kv_cache::CacheState::toString"], [0, 2, 
1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommStateE", "tensorrt_llm::executor::kv_cache::CommState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10AgentStateEEi", "tensorrt_llm::executor::kv_cache::CommState::CommState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10SizeType32EEi", "tensorrt_llm::executor::kv_cache::CommState::CommState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI11SocketStateEEi", "tensorrt_llm::executor::kv_cache::CommState::CommState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt8uint16_tENSt6stringE", "tensorrt_llm::executor::kv_cache::CommState::CommState"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateEv", "tensorrt_llm::executor::kv_cache::CommState::CommState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10AgentStateEEi", "tensorrt_llm::executor::kv_cache::CommState::CommState::agentState"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt8uint16_tENSt6stringE", "tensorrt_llm::executor::kv_cache::CommState::CommState::ip"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt8uint16_tENSt6stringE", "tensorrt_llm::executor::kv_cache::CommState::CommState::port"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10SizeType32EEi", "tensorrt_llm::executor::kv_cache::CommState::CommState::ranks"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10AgentStateEEi", "tensorrt_llm::executor::kv_cache::CommState::CommState::selfIdx"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI10SizeType32EEi", "tensorrt_llm::executor::kv_cache::CommState::CommState::selfIdx"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI11SocketStateEEi", 
"tensorrt_llm::executor::kv_cache::CommState::CommState::selfIdx"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState9CommStateENSt6vectorI11SocketStateEEi", "tensorrt_llm::executor::kv_cache::CommState::CommState::socketState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState13getAgentStateEv", "tensorrt_llm::executor::kv_cache::CommState::getAgentState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState11getMpiStateEv", "tensorrt_llm::executor::kv_cache::CommState::getMpiState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState10getSelfIdxEv", "tensorrt_llm::executor::kv_cache::CommState::getSelfIdx"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState14getSocketStateEv", "tensorrt_llm::executor::kv_cache::CommState::getSocketState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState12isAgentStateEv", "tensorrt_llm::executor::kv_cache::CommState::isAgentState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState10isMpiStateEv", "tensorrt_llm::executor::kv_cache::CommState::isMpiState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState13isSocketStateEv", "tensorrt_llm::executor::kv_cache::CommState::isSocketState"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState8mSelfIdxE", "tensorrt_llm::executor::kv_cache::CommState::mSelfIdx"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache9CommState6mStateE", "tensorrt_llm::executor::kv_cache::CommState::mState"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommStateeqERK9CommState", "tensorrt_llm::executor::kv_cache::CommState::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommStateeqERK9CommState", "tensorrt_llm::executor::kv_cache::CommState::operator==::other"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache9CommState8toStringEv", "tensorrt_llm::executor::kv_cache::CommState::toString"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10ConnectionE", 
"tensorrt_llm::executor::kv_cache::Connection"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection12isThreadSafeEv", "tensorrt_llm::executor::kv_cache::Connection::isThreadSafe"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4recvERK11DataContextPv6size_t", "tensorrt_llm::executor::kv_cache::Connection::recv"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4recvERK11DataContextPv6size_t", "tensorrt_llm::executor::kv_cache::Connection::recv::ctx"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4recvERK11DataContextPv6size_t", "tensorrt_llm::executor::kv_cache::Connection::recv::data"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4recvERK11DataContextPv6size_t", "tensorrt_llm::executor::kv_cache::Connection::recv::size"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4sendERK11DataContextPKv6size_t", "tensorrt_llm::executor::kv_cache::Connection::send"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4sendERK11DataContextPKv6size_t", "tensorrt_llm::executor::kv_cache::Connection::send::ctx"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4sendERK11DataContextPKv6size_t", "tensorrt_llm::executor::kv_cache::Connection::send::data"], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10Connection4sendERK11DataContextPKv6size_t", "tensorrt_llm::executor::kv_cache::Connection::send::size"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10ConnectionD0Ev", "tensorrt_llm::executor::kv_cache::Connection::~Connection"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache18ConnectionInfoTypeE", "tensorrt_llm::executor::kv_cache::ConnectionInfoType"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManagerE", "tensorrt_llm::executor::kv_cache::ConnectionManager"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache17ConnectionManager12getCommStateEv", "tensorrt_llm::executor::kv_cache::ConnectionManager::getCommState"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManager14getConnectionsERK9CommState", "tensorrt_llm::executor::kv_cache::ConnectionManager::getConnections"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManager14getConnectionsERK9CommState", "tensorrt_llm::executor::kv_cache::ConnectionManager::getConnections::state"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManager11recvConnectERK11DataContextPv6size_t", "tensorrt_llm::executor::kv_cache::ConnectionManager::recvConnect"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManager11recvConnectERK11DataContextPv6size_t", "tensorrt_llm::executor::kv_cache::ConnectionManager::recvConnect::ctx"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManager11recvConnectERK11DataContextPv6size_t", "tensorrt_llm::executor::kv_cache::ConnectionManager::recvConnect::data"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManager11recvConnectERK11DataContextPv6size_t", "tensorrt_llm::executor::kv_cache::ConnectionManager::recvConnect::size"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache17ConnectionManagerD0Ev", "tensorrt_llm::executor::kv_cache::ConnectionManager::~ConnectionManager"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11DataContextE", "tensorrt_llm::executor::kv_cache::DataContext"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11DataContext11DataContextEi", "tensorrt_llm::executor::kv_cache::DataContext::DataContext"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11DataContext11DataContextEi", "tensorrt_llm::executor::kv_cache::DataContext::DataContext::tag"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache11DataContext6getTagEv", "tensorrt_llm::executor::kv_cache::DataContext::getTag"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11DataContext4mTagE", "tensorrt_llm::executor::kv_cache::DataContext::mTag"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderE", 
"tensorrt_llm::executor::kv_cache::DynLibLoader"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader12DynLibLoaderERK12DynLibLoader", "tensorrt_llm::executor::kv_cache::DynLibLoader::DynLibLoader"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader12DynLibLoaderEv", "tensorrt_llm::executor::kv_cache::DynLibLoader::DynLibLoader"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader5dlSymEPvPKc", "tensorrt_llm::executor::kv_cache::DynLibLoader::dlSym"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader5dlSymEPvPKc", "tensorrt_llm::executor::kv_cache::DynLibLoader::dlSym::handle"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader5dlSymEPvPKc", "tensorrt_llm::executor::kv_cache::DynLibLoader::dlSym::symbol"], [0, 3, 1, "_CPPv4I0EN12tensorrt_llm8executor8kv_cache12DynLibLoader18getFunctionPointerE9FunctionTRKNSt6stringERKNSt6stringE", "tensorrt_llm::executor::kv_cache::DynLibLoader::getFunctionPointer"], [0, 8, 1, "_CPPv4I0EN12tensorrt_llm8executor8kv_cache12DynLibLoader18getFunctionPointerE9FunctionTRKNSt6stringERKNSt6stringE", "tensorrt_llm::executor::kv_cache::DynLibLoader::getFunctionPointer::FunctionT"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor8kv_cache12DynLibLoader18getFunctionPointerE9FunctionTRKNSt6stringERKNSt6stringE", "tensorrt_llm::executor::kv_cache::DynLibLoader::getFunctionPointer::funcName"], [0, 4, 1, "_CPPv4I0EN12tensorrt_llm8executor8kv_cache12DynLibLoader18getFunctionPointerE9FunctionTRKNSt6stringERKNSt6stringE", "tensorrt_llm::executor::kv_cache::DynLibLoader::getFunctionPointer::libName"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9getHandleERKNSt6stringE", "tensorrt_llm::executor::kv_cache::DynLibLoader::getHandle"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9getHandleERKNSt6stringE", "tensorrt_llm::executor::kv_cache::DynLibLoader::getHandle::name"], [0, 3, 1, 
"_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader11getInstanceEv", "tensorrt_llm::executor::kv_cache::DynLibLoader::getInstance"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9mDllMutexE", "tensorrt_llm::executor::kv_cache::DynLibLoader::mDllMutex"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoader9mHandlersE", "tensorrt_llm::executor::kv_cache::DynLibLoader::mHandlers"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderaSERK12DynLibLoader", "tensorrt_llm::executor::kv_cache::DynLibLoader::operator="], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache12DynLibLoaderD0Ev", "tensorrt_llm::executor::kv_cache::DynLibLoader::~DynLibLoader"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDescE", "tensorrt_llm::executor::kv_cache::MemoryDesc"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescE9uintptr_t6size_t8uint32_t", "tensorrt_llm::executor::kv_cache::MemoryDesc::MemoryDesc"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescEPv6size_t8uint32_t", "tensorrt_llm::executor::kv_cache::MemoryDesc::MemoryDesc"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescERKNSt6vectorIcEE8uint32_t", "tensorrt_llm::executor::kv_cache::MemoryDesc::MemoryDesc"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescE9uintptr_t6size_t8uint32_t", "tensorrt_llm::executor::kv_cache::MemoryDesc::MemoryDesc::addr"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescEPv6size_t8uint32_t", "tensorrt_llm::executor::kv_cache::MemoryDesc::MemoryDesc::addr"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescE9uintptr_t6size_t8uint32_t", "tensorrt_llm::executor::kv_cache::MemoryDesc::MemoryDesc::deviceId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescEPv6size_t8uint32_t", "tensorrt_llm::executor::kv_cache::MemoryDesc::MemoryDesc::deviceId"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescERKNSt6vectorIcEE8uint32_t", "tensorrt_llm::executor::kv_cache::MemoryDesc::MemoryDesc::deviceId"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescE9uintptr_t6size_t8uint32_t", "tensorrt_llm::executor::kv_cache::MemoryDesc::MemoryDesc::len"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescEPv6size_t8uint32_t", "tensorrt_llm::executor::kv_cache::MemoryDesc::MemoryDesc::len"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc10MemoryDescERKNSt6vectorIcEE8uint32_t", "tensorrt_llm::executor::kv_cache::MemoryDesc::MemoryDesc::vec"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc11deserializeERNSt7istreamE", "tensorrt_llm::executor::kv_cache::MemoryDesc::deserialize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc11deserializeERNSt7istreamE", "tensorrt_llm::executor::kv_cache::MemoryDesc::deserialize::is"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10MemoryDesc7getAddrEv", "tensorrt_llm::executor::kv_cache::MemoryDesc::getAddr"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10MemoryDesc11getDeviceIdEv", "tensorrt_llm::executor::kv_cache::MemoryDesc::getDeviceId"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache10MemoryDesc6getLenEv", "tensorrt_llm::executor::kv_cache::MemoryDesc::getLen"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc5mAddrE", "tensorrt_llm::executor::kv_cache::MemoryDesc::mAddr"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc9mDeviceIdE", "tensorrt_llm::executor::kv_cache::MemoryDesc::mDeviceId"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc4mLenE", "tensorrt_llm::executor::kv_cache::MemoryDesc::mLen"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc9serializeERK10MemoryDescRNSt7ostreamE", "tensorrt_llm::executor::kv_cache::MemoryDesc::serialize"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc9serializeERK10MemoryDescRNSt7ostreamE", "tensorrt_llm::executor::kv_cache::MemoryDesc::serialize::memoryDesc"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc9serializeERK10MemoryDescRNSt7ostreamE", "tensorrt_llm::executor::kv_cache::MemoryDesc::serialize::os"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc14serializedSizeERK10MemoryDesc", "tensorrt_llm::executor::kv_cache::MemoryDesc::serializedSize"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryDesc14serializedSizeERK10MemoryDesc", "tensorrt_llm::executor::kv_cache::MemoryDesc::serializedSize::memoryDesc"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescsE", "tensorrt_llm::executor::kv_cache::MemoryDescs"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs11MemoryDescsE10MemoryTypeNSt6vectorI10MemoryDescEE", "tensorrt_llm::executor::kv_cache::MemoryDescs::MemoryDescs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs11MemoryDescsE10MemoryTypeNSt6vectorI10MemoryDescEE", "tensorrt_llm::executor::kv_cache::MemoryDescs::MemoryDescs::descs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs11MemoryDescsE10MemoryTypeNSt6vectorI10MemoryDescEE", "tensorrt_llm::executor::kv_cache::MemoryDescs::MemoryDescs::type"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache11MemoryDescs8getDescsEv", "tensorrt_llm::executor::kv_cache::MemoryDescs::getDescs"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache11MemoryDescs7getTypeEv", "tensorrt_llm::executor::kv_cache::MemoryDescs::getType"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs6mDescsE", "tensorrt_llm::executor::kv_cache::MemoryDescs::mDescs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11MemoryDescs5mTypeE", "tensorrt_llm::executor::kv_cache::MemoryDescs::mType"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryTypeE", "tensorrt_llm::executor::kv_cache::MemoryType"], [0, 7, 1, 
"_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType4kBLKE", "tensorrt_llm::executor::kv_cache::MemoryType::kBLK"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType5kDRAME", "tensorrt_llm::executor::kv_cache::MemoryType::kDRAM"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType5kFILEE", "tensorrt_llm::executor::kv_cache::MemoryType::kFILE"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType4kOBJE", "tensorrt_llm::executor::kv_cache::MemoryType::kOBJ"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10MemoryType5kVRAME", "tensorrt_llm::executor::kv_cache::MemoryType::kVRAM"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache8MpiStateE", "tensorrt_llm::executor::kv_cache::MpiState"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache8MpiState6mRanksE", "tensorrt_llm::executor::kv_cache::MpiState::mRanks"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache8MpiStateeqERK8MpiState", "tensorrt_llm::executor::kv_cache::MpiState::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache8MpiStateeqERK8MpiState", "tensorrt_llm::executor::kv_cache::MpiState::operator==::other"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache8MpiState8toStringEv", "tensorrt_llm::executor::kv_cache::MpiState::toString"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache13RegisterDescsE", "tensorrt_llm::executor::kv_cache::RegisterDescs"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11SocketStateE", "tensorrt_llm::executor::kv_cache::SocketState"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11SocketState3mIpE", "tensorrt_llm::executor::kv_cache::SocketState::mIp"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11SocketState5mPortE", "tensorrt_llm::executor::kv_cache::SocketState::mPort"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache11SocketStateeqERK11SocketState", "tensorrt_llm::executor::kv_cache::SocketState::operator=="], [0, 4, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache11SocketStateeqERK11SocketState", 
"tensorrt_llm::executor::kv_cache::SocketState::operator==::other"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache11SocketState8toStringEv", "tensorrt_llm::executor::kv_cache::SocketState::toString"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache11SyncMessageE", "tensorrt_llm::executor::kv_cache::SyncMessage"], [0, 1, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache13TransferDescsE", "tensorrt_llm::executor::kv_cache::TransferDescs"], [0, 6, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOpE", "tensorrt_llm::executor::kv_cache::TransferOp"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOp5kREADE", "tensorrt_llm::executor::kv_cache::TransferOp::kREAD"], [0, 7, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache10TransferOp6kWRITEE", "tensorrt_llm::executor::kv_cache::TransferOp::kWRITE"], [0, 2, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequestE", "tensorrt_llm::executor::kv_cache::TransferRequest"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest15TransferRequestE10TransferOp13TransferDescs13TransferDescsRKNSt6stringENSt8optionalI11SyncMessageEE", "tensorrt_llm::executor::kv_cache::TransferRequest::TransferRequest"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest15TransferRequestE10TransferOp13TransferDescs13TransferDescsRKNSt6stringENSt8optionalI11SyncMessageEE", "tensorrt_llm::executor::kv_cache::TransferRequest::TransferRequest::dstDescs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest15TransferRequestE10TransferOp13TransferDescs13TransferDescsRKNSt6stringENSt8optionalI11SyncMessageEE", "tensorrt_llm::executor::kv_cache::TransferRequest::TransferRequest::op"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest15TransferRequestE10TransferOp13TransferDescs13TransferDescsRKNSt6stringENSt8optionalI11SyncMessageEE", "tensorrt_llm::executor::kv_cache::TransferRequest::TransferRequest::remoteName"], [0, 4, 1, 
"_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest15TransferRequestE10TransferOp13TransferDescs13TransferDescsRKNSt6stringENSt8optionalI11SyncMessageEE", "tensorrt_llm::executor::kv_cache::TransferRequest::TransferRequest::srcDescs"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest15TransferRequestE10TransferOp13TransferDescs13TransferDescsRKNSt6stringENSt8optionalI11SyncMessageEE", "tensorrt_llm::executor::kv_cache::TransferRequest::TransferRequest::syncMessage"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest11getDstDescsEv", "tensorrt_llm::executor::kv_cache::TransferRequest::getDstDescs"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest5getOpEv", "tensorrt_llm::executor::kv_cache::TransferRequest::getOp"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest13getRemoteNameEv", "tensorrt_llm::executor::kv_cache::TransferRequest::getRemoteName"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest11getSrcDescsEv", "tensorrt_llm::executor::kv_cache::TransferRequest::getSrcDescs"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache15TransferRequest14getSyncMessageEv", "tensorrt_llm::executor::kv_cache::TransferRequest::getSyncMessage"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest9mDstDescsE", "tensorrt_llm::executor::kv_cache::TransferRequest::mDstDescs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest3mOpE", "tensorrt_llm::executor::kv_cache::TransferRequest::mOp"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest11mRemoteNameE", "tensorrt_llm::executor::kv_cache::TransferRequest::mRemoteName"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest9mSrcDescsE", "tensorrt_llm::executor::kv_cache::TransferRequest::mSrcDescs"], [0, 5, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache15TransferRequest12mSyncMessageE", "tensorrt_llm::executor::kv_cache::TransferRequest::mSyncMessage"], [0, 2, 1, 
"_CPPv4N12tensorrt_llm8executor8kv_cache14TransferStatusE", "tensorrt_llm::executor::kv_cache::TransferStatus"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache14TransferStatus11isCompletedEv", "tensorrt_llm::executor::kv_cache::TransferStatus::isCompleted"], [0, 3, 1, "_CPPv4NK12tensorrt_llm8executor8kv_cache14TransferStatus4waitEv", "tensorrt_llm::executor::kv_cache::TransferStatus::wait"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor8kv_cache14TransferStatusD0Ev", "tensorrt_llm::executor::kv_cache::TransferStatus::~TransferStatus"], [0, 3, 1, "_CPPv4IDpEN12tensorrt_llm8executor8kv_cache17makeTransferAgentENSt10unique_ptrI17BaseTransferAgentEERKNSt6stringEDpRR4Args", "tensorrt_llm::executor::kv_cache::makeTransferAgent"], [0, 8, 1, "_CPPv4IDpEN12tensorrt_llm8executor8kv_cache17makeTransferAgentENSt10unique_ptrI17BaseTransferAgentEERKNSt6stringEDpRR4Args", "tensorrt_llm::executor::kv_cache::makeTransferAgent::Args"], [0, 4, 1, "_CPPv4IDpEN12tensorrt_llm8executor8kv_cache17makeTransferAgentENSt10unique_ptrI17BaseTransferAgentEERKNSt6stringEDpRR4Args", "tensorrt_llm::executor::kv_cache::makeTransferAgent::args"], [0, 4, 1, "_CPPv4IDpEN12tensorrt_llm8executor8kv_cache17makeTransferAgentENSt10unique_ptrI17BaseTransferAgentEERKNSt6stringEDpRR4Args", "tensorrt_llm::executor::kv_cache::makeTransferAgent::backend"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executorlsERNSt7ostreamE21ContextChunkingPolicy", "tensorrt_llm::executor::operator&lt;&lt;"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executorlsERNSt7ostreamE23CapacitySchedulerPolicy", "tensorrt_llm::executor::operator&lt;&lt;"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executorlsERNSt7ostreamE21ContextChunkingPolicy", "tensorrt_llm::executor::operator&lt;&lt;::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executorlsERNSt7ostreamE23CapacitySchedulerPolicy", "tensorrt_llm::executor::operator&lt;&lt;::os"], [0, 4, 1, "_CPPv4N12tensorrt_llm8executorlsERNSt7ostreamE21ContextChunkingPolicy", "tensorrt_llm::executor::operator&lt;&lt;::policy"], [0, 4, 
1, "_CPPv4N12tensorrt_llm8executorlsERNSt7ostreamE23CapacitySchedulerPolicy", "tensorrt_llm::executor::operator&lt;&lt;::policy"], [0, 3, 1, "_CPPv4N12tensorrt_llm8executor7versionEv", "tensorrt_llm::executor::version"], [1, 1, 1, "_CPPv4N12tensorrt_llm6layersE", "tensorrt_llm::layers"], [0, 1, 1, "_CPPv4N12tensorrt_llm3mpiE", "tensorrt_llm::mpi"], [0, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [0, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, 
"_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtimeE", "tensorrt_llm::runtime"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffersE", "tensorrt_llm::runtime::AllReduceBuffers"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb", "tensorrt_llm::runtime::AllReduceBuffers::AllReduceBuffers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb", "tensorrt_llm::runtime::AllReduceBuffers::AllReduceBuffers::fakeBuffers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb", "tensorrt_llm::runtime::AllReduceBuffers::AllReduceBuffers::hiddenSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb", 
"tensorrt_llm::runtime::AllReduceBuffers::AllReduceBuffers::manager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb", "tensorrt_llm::runtime::AllReduceBuffers::AllReduceBuffers::maxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb", "tensorrt_llm::runtime::AllReduceBuffers::AllReduceBuffers::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb", "tensorrt_llm::runtime::AllReduceBuffers::AllReduceBuffers::maxSequenceLength"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers16AllReduceBuffersE10SizeType3210SizeType3210SizeType3210SizeType32RK13BufferManagerRK11WorldConfigKb", "tensorrt_llm::runtime::AllReduceBuffers::AllReduceBuffers::worldConfig"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9TensorPtrE", "tensorrt_llm::runtime::AllReduceBuffers::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers18mAllReduceCommPtrsE", "tensorrt_llm::runtime::AllReduceBuffers::mAllReduceCommPtrs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers9mFlagPtrsE", "tensorrt_llm::runtime::AllReduceBuffers::mFlagPtrs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime16AllReduceBuffers17mIpcMemoryHandlesE", "tensorrt_llm::runtime::AllReduceBuffers::mIpcMemoryHandles"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime14BufferDataTypeE", "tensorrt_llm::runtime::BufferDataType"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType14BufferDataTypeEN8nvinfer18DataTypeEbb", "tensorrt_llm::runtime::BufferDataType::BufferDataType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType14BufferDataTypeEN8nvinfer18DataTypeEbb", "tensorrt_llm::runtime::BufferDataType::BufferDataType::_unsigned"], [1, 
4, 1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType14BufferDataTypeEN8nvinfer18DataTypeEbb", "tensorrt_llm::runtime::BufferDataType::BufferDataType::dataType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType14BufferDataTypeEN8nvinfer18DataTypeEbb", "tensorrt_llm::runtime::BufferDataType::BufferDataType::pointer"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType11getDataTypeEv", "tensorrt_llm::runtime::BufferDataType::getDataType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType7getSizeEv", "tensorrt_llm::runtime::BufferDataType::getSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType13getSizeInBitsEv", "tensorrt_llm::runtime::BufferDataType::getSizeInBits"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType9isPointerEv", "tensorrt_llm::runtime::BufferDataType::isPointer"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataType10isUnsignedEv", "tensorrt_llm::runtime::BufferDataType::isUnsigned"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType15kTrtPointerTypeE", "tensorrt_llm::runtime::BufferDataType::kTrtPointerType"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType9mDataTypeE", "tensorrt_llm::runtime::BufferDataType::mDataType"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType8mPointerE", "tensorrt_llm::runtime::BufferDataType::mPointer"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14BufferDataType9mUnsignedE", "tensorrt_llm::runtime::BufferDataType::mUnsigned"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14BufferDataTypecvN8nvinfer18DataTypeEEv", "tensorrt_llm::runtime::BufferDataType::operator nvinfer1::DataType"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManagerE", "tensorrt_llm::runtime::BufferManager"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager13BufferManagerE13CudaStreamPtrb", "tensorrt_llm::runtime::BufferManager::BufferManager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager13BufferManagerE13CudaStreamPtrb", 
"tensorrt_llm::runtime::BufferManager::BufferManager::stream"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager13BufferManagerE13CudaStreamPtrb", "tensorrt_llm::runtime::BufferManager::BufferManager::trimPool"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager14CudaMemPoolPtrE", "tensorrt_llm::runtime::BufferManager::CudaMemPoolPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager13CudaStreamPtrE", "tensorrt_llm::runtime::BufferManager::CudaStreamPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10IBufferPtrE", "tensorrt_llm::runtime::BufferManager::IBufferPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10ITensorPtrE", "tensorrt_llm::runtime::BufferManager::ITensorPtr"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::allocate"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeNSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::allocate"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::allocate::dims"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::allocate::memoryType"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeNSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::allocate::memoryType"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeNSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::allocate::size"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::allocate::type"], [1, 4, 1, 
"_CPPv4NK12tensorrt_llm7runtime13BufferManager8allocateE10MemoryTypeNSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::allocate::type"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyEPKvR7IBuffer", "tensorrt_llm::runtime::BufferManager::copy"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyEPKvR7IBuffer10MemoryType", "tensorrt_llm::runtime::BufferManager::copy"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferPv", "tensorrt_llm::runtime::BufferManager::copy"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferPv10MemoryType", "tensorrt_llm::runtime::BufferManager::copy"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferR7IBuffer", "tensorrt_llm::runtime::BufferManager::copy"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyEPKvR7IBuffer", "tensorrt_llm::runtime::BufferManager::copy::dst"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyEPKvR7IBuffer10MemoryType", "tensorrt_llm::runtime::BufferManager::copy::dst"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferPv", "tensorrt_llm::runtime::BufferManager::copy::dst"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferPv10MemoryType", "tensorrt_llm::runtime::BufferManager::copy::dst"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferR7IBuffer", "tensorrt_llm::runtime::BufferManager::copy::dst"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferPv10MemoryType", "tensorrt_llm::runtime::BufferManager::copy::dstType"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyEPKvR7IBuffer", "tensorrt_llm::runtime::BufferManager::copy::src"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyEPKvR7IBuffer10MemoryType", "tensorrt_llm::runtime::BufferManager::copy::src"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferPv", 
"tensorrt_llm::runtime::BufferManager::copy::src"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferPv10MemoryType", "tensorrt_llm::runtime::BufferManager::copy::src"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyERK7IBufferR7IBuffer", "tensorrt_llm::runtime::BufferManager::copy::src"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager4copyEPKvR7IBuffer10MemoryType", "tensorrt_llm::runtime::BufferManager::copy::srcType"], [1, 3, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10IBufferPtrRKNSt6vectorI1TEE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom"], [1, 3, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrP1TN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom"], [1, 3, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrRKNSt6vectorI1TEEN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8copyFromERK7IBuffer10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8copyFromERK7ITensor10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom"], [1, 8, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10IBufferPtrRKNSt6vectorI1TEE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::T"], [1, 8, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrP1TN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::T"], [1, 8, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrRKNSt6vectorI1TEEN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::T"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrP1TN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::dims"], [1, 4, 1, 
"_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrRKNSt6vectorI1TEEN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::dims"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10IBufferPtrRKNSt6vectorI1TEE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::memoryType"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrP1TN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::memoryType"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrRKNSt6vectorI1TEEN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::memoryType"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8copyFromERK7IBuffer10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::memoryType"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8copyFromERK7ITensor10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::memoryType"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10IBufferPtrRKNSt6vectorI1TEE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::src"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrP1TN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::src"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime13BufferManager8copyFromE10ITensorPtrRKNSt6vectorI1TEEN8nvinfer14DimsE10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::src"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8copyFromERK7IBuffer10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::src"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager8copyFromERK7ITensor10MemoryType", "tensorrt_llm::runtime::BufferManager::copyFrom::src"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager3cpuEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::cpu"], [1, 
3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager3cpuENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::cpu"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager3cpuEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::cpu::dims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager3cpuENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::cpu::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager3cpuEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::cpu::type"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager3cpuENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::cpu::type"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager11emptyBufferE10MemoryTypeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::emptyBuffer"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager11emptyBufferE10MemoryTypeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::emptyBuffer::memoryType"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager11emptyBufferE10MemoryTypeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::emptyBuffer::type"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager11emptyTensorE10MemoryTypeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::emptyTensor"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager11emptyTensorE10MemoryTypeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::emptyTensor::memoryType"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager11emptyTensorE10MemoryTypeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::emptyTensor::type"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager9getStreamEv", "tensorrt_llm::runtime::BufferManager::getStream"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager3gpuEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpu"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime13BufferManager3gpuENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpu"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager3gpuEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpu::dims"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager3gpuENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpu::size"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager3gpuEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpu::type"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager3gpuENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpu::type"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7gpuSyncEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpuSync"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7gpuSyncENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpuSync"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7gpuSyncEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpuSync::dims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7gpuSyncENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpuSync::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7gpuSyncEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpuSync::type"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7gpuSyncENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::gpuSync::type"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7ipcNvlsENSt3setIiEEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::ipcNvls"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7ipcNvlsENSt3setIiEEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::ipcNvls::dims"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime13BufferManager7ipcNvlsENSt3setIiEEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::ipcNvls::ranks"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7ipcNvlsENSt3setIiEEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::ipcNvls::type"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10kBYTE_TYPEE", "tensorrt_llm::runtime::BufferManager::kBYTE_TYPE"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager5mPoolE", "tensorrt_llm::runtime::BufferManager::mPool"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7mStreamE", "tensorrt_llm::runtime::BufferManager::mStream"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager9mTrimPoolE", "tensorrt_llm::runtime::BufferManager::mTrimPool"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7managedEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::managed"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7managedENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::managed"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7managedEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::managed::dims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7managedENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::managed::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7managedEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::managed::type"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager7managedENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::managed::type"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager14memoryPoolFreeEv", "tensorrt_llm::runtime::BufferManager::memoryPoolFree"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager18memoryPoolReservedEv", 
"tensorrt_llm::runtime::BufferManager::memoryPoolReserved"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager16memoryPoolTrimToENSt6size_tE", "tensorrt_llm::runtime::BufferManager::memoryPoolTrimTo"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager16memoryPoolTrimToENSt6size_tE", "tensorrt_llm::runtime::BufferManager::memoryPoolTrimTo::size"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager14memoryPoolUsedEv", "tensorrt_llm::runtime::BufferManager::memoryPoolUsed"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager6pinnedEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinned"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager6pinnedENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinned"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager6pinnedEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinned::dims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager6pinnedENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinned::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager6pinnedEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinned::type"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager6pinnedENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinned::type"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10pinnedPoolEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinnedPool"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10pinnedPoolENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinnedPool"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10pinnedPoolEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinnedPool::dims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10pinnedPoolENSt6size_tEN8nvinfer18DataTypeE", 
"tensorrt_llm::runtime::BufferManager::pinnedPool::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10pinnedPoolEN8nvinfer14DimsEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinnedPool::type"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManager10pinnedPoolENSt6size_tEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::BufferManager::pinnedPool::type"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager6setMemER7IBuffer7int32_t", "tensorrt_llm::runtime::BufferManager::setMem"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager6setMemER7IBuffer7int32_t", "tensorrt_llm::runtime::BufferManager::setMem::buffer"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager6setMemER7IBuffer7int32_t", "tensorrt_llm::runtime::BufferManager::setMem::value"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager7setZeroER7IBuffer", "tensorrt_llm::runtime::BufferManager::setZero"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13BufferManager7setZeroER7IBuffer", "tensorrt_llm::runtime::BufferManager::setZero::buffer"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13BufferManagerD0Ev", "tensorrt_llm::runtime::BufferManager::~BufferManager"], [1, 2, 1, "_CPPv4I0EN12tensorrt_llm7runtime11BufferRangeE", "tensorrt_llm::runtime::BufferRange"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime11BufferRange4BaseE", "tensorrt_llm::runtime::BufferRange::Base"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI1UEEbEEEN12tensorrt_llm7runtime11BufferRange11BufferRangeERK7IBuffer", "tensorrt_llm::runtime::BufferRange::BufferRange"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tIXntNSt10is_const_vI1UEEEbEEEN12tensorrt_llm7runtime11BufferRange11BufferRangeER7IBuffer", "tensorrt_llm::runtime::BufferRange::BufferRange"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11BufferRange11BufferRangeEP1T9size_type", "tensorrt_llm::runtime::BufferRange::BufferRange"], [1, 8, 1, 
"_CPPv4I0_NSt11enable_if_tINSt10is_const_vI1UEEbEEEN12tensorrt_llm7runtime11BufferRange11BufferRangeERK7IBuffer", "tensorrt_llm::runtime::BufferRange::BufferRange::U"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tIXntNSt10is_const_vI1UEEEbEEEN12tensorrt_llm7runtime11BufferRange11BufferRangeER7IBuffer", "tensorrt_llm::runtime::BufferRange::BufferRange::U"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI1UEEbEEEN12tensorrt_llm7runtime11BufferRange11BufferRangeERK7IBuffer", "tensorrt_llm::runtime::BufferRange::BufferRange::buffer"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tIXntNSt10is_const_vI1UEEEbEEEN12tensorrt_llm7runtime11BufferRange11BufferRangeER7IBuffer", "tensorrt_llm::runtime::BufferRange::BufferRange::buffer"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11BufferRange11BufferRangeEP1T9size_type", "tensorrt_llm::runtime::BufferRange::BufferRange::data"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11BufferRange11BufferRangeEP1T9size_type", "tensorrt_llm::runtime::BufferRange::BufferRange::size"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime11BufferRangeE", "tensorrt_llm::runtime::BufferRange::T"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEventE", "tensorrt_llm::runtime::CudaEvent"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent9CudaEventE7pointerb", "tensorrt_llm::runtime::CudaEvent::CudaEvent"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent9CudaEventEj", "tensorrt_llm::runtime::CudaEvent::CudaEvent"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent9CudaEventE7pointerb", "tensorrt_llm::runtime::CudaEvent::CudaEvent::event"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent9CudaEventEj", "tensorrt_llm::runtime::CudaEvent::CudaEvent::flags"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent9CudaEventE7pointerb", "tensorrt_llm::runtime::CudaEvent::CudaEvent::ownsEvent"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7DeleterE", "tensorrt_llm::runtime::CudaEvent::Deleter"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7Deleter7DeleterEb", 
"tensorrt_llm::runtime::CudaEvent::Deleter::Deleter"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7Deleter7DeleterEv", "tensorrt_llm::runtime::CudaEvent::Deleter::Deleter"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7Deleter7DeleterEb", "tensorrt_llm::runtime::CudaEvent::Deleter::Deleter::ownsEvent"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7Deleter10mOwnsEventE", "tensorrt_llm::runtime::CudaEvent::Deleter::mOwnsEvent"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9CudaEvent7DeleterclE7pointer", "tensorrt_llm::runtime::CudaEvent::Deleter::operator()"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9CudaEvent7DeleterclE7pointer", "tensorrt_llm::runtime::CudaEvent::Deleter::operator()::event"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent8EventPtrE", "tensorrt_llm::runtime::CudaEvent::EventPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent12element_typeE", "tensorrt_llm::runtime::CudaEvent::element_type"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9CudaEvent3getEv", "tensorrt_llm::runtime::CudaEvent::get"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent6mEventE", "tensorrt_llm::runtime::CudaEvent::mEvent"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime9CudaEvent7pointerE", "tensorrt_llm::runtime::CudaEvent::pointer"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9CudaEvent11synchronizeEv", "tensorrt_llm::runtime::CudaEvent::synchronize"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStreamE", "tensorrt_llm::runtime::CudaStream"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamE12cudaStream_t", "tensorrt_llm::runtime::CudaStream::CudaStream"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamE12cudaStream_tib", "tensorrt_llm::runtime::CudaStream::CudaStream"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamEji", "tensorrt_llm::runtime::CudaStream::CudaStream"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamE12cudaStream_tib", 
"tensorrt_llm::runtime::CudaStream::CudaStream::device"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamEji", "tensorrt_llm::runtime::CudaStream::CudaStream::flags"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamE12cudaStream_tib", "tensorrt_llm::runtime::CudaStream::CudaStream::ownsStream"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamEji", "tensorrt_llm::runtime::CudaStream::CudaStream::priority"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamE12cudaStream_t", "tensorrt_llm::runtime::CudaStream::CudaStream::stream"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream10CudaStreamE12cudaStream_tib", "tensorrt_llm::runtime::CudaStream::CudaStream::stream"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7DeleterE", "tensorrt_llm::runtime::CudaStream::Deleter"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7Deleter7DeleterEb", "tensorrt_llm::runtime::CudaStream::Deleter::Deleter"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7Deleter7DeleterEv", "tensorrt_llm::runtime::CudaStream::Deleter::Deleter"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7Deleter7DeleterEb", "tensorrt_llm::runtime::CudaStream::Deleter::Deleter::ownsStream"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7Deleter11mOwnsStreamE", "tensorrt_llm::runtime::CudaStream::Deleter::mOwnsStream"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream7DeleterclE12cudaStream_t", "tensorrt_llm::runtime::CudaStream::Deleter::operator()"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream7DeleterclE12cudaStream_t", "tensorrt_llm::runtime::CudaStream::Deleter::operator()::stream"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream9StreamPtrE", "tensorrt_llm::runtime::CudaStream::StreamPtr"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream3getEv", "tensorrt_llm::runtime::CudaStream::get"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream9getDeviceEv", 
"tensorrt_llm::runtime::CudaStream::getDevice"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7mDeviceE", "tensorrt_llm::runtime::CudaStream::mDevice"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10CudaStream7mStreamE", "tensorrt_llm::runtime::CudaStream::mStream"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream6recordEN9CudaEvent7pointerE", "tensorrt_llm::runtime::CudaStream::record"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream6recordERK9CudaEvent", "tensorrt_llm::runtime::CudaStream::record"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream6recordEN9CudaEvent7pointerE", "tensorrt_llm::runtime::CudaStream::record::event"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream6recordERK9CudaEvent", "tensorrt_llm::runtime::CudaStream::record::event"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream11synchronizeEv", "tensorrt_llm::runtime::CudaStream::synchronize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream4waitEN9CudaEvent7pointerE", "tensorrt_llm::runtime::CudaStream::wait"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream4waitERK9CudaEvent", "tensorrt_llm::runtime::CudaStream::wait"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream4waitEN9CudaEvent7pointerE", "tensorrt_llm::runtime::CudaStream::wait::event"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10CudaStream4waitERK9CudaEvent", "tensorrt_llm::runtime::CudaStream::wait::event"], [1, 2, 1, "_CPPv4I_N8nvinfer18DataTypeE_b_bEN12tensorrt_llm7runtime14DataTypeTraitsE", "tensorrt_llm::runtime::DataTypeTraits"], [1, 8, 1, "_CPPv4I_N8nvinfer18DataTypeE_b_bEN12tensorrt_llm7runtime14DataTypeTraitsE", "tensorrt_llm::runtime::DataTypeTraits::kDataType"], [1, 8, 1, "_CPPv4I_N8nvinfer18DataTypeE_b_bEN12tensorrt_llm7runtime14DataTypeTraitsE", "tensorrt_llm::runtime::DataTypeTraits::kIsPointer"], [1, 8, 1, "_CPPv4I_N8nvinfer18DataTypeE_b_bEN12tensorrt_llm7runtime14DataTypeTraitsE", "tensorrt_llm::runtime::DataTypeTraits::kIsUnsigned"], [1, 2, 1, 
"_CPPv4I_N8nvinfer18DataTypeE_bEN12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEEE", "tensorrt_llm::runtime::DataTypeTraits&lt;kDataType, kUnsigned, true&gt;"], [1, 8, 1, "_CPPv4I_N8nvinfer18DataTypeE_bEN12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEEE", "tensorrt_llm::runtime::DataTypeTraits&lt;kDataType, kUnsigned, true&gt;::kDataType"], [1, 8, 1, "_CPPv4I_N8nvinfer18DataTypeE_bEN12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEEE", "tensorrt_llm::runtime::DataTypeTraits&lt;kDataType, kUnsigned, true&gt;::kUnsigned"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;kDataType, kUnsigned, true&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;kDataType, kUnsigned, true&gt;::size"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsI9kDataType9kUnsignedXL1EEE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;kDataType, kUnsigned, true&gt;::type"], [1, 2, 1, "_CPPv4I_bEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kBOOLE9kUnsignedEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kBOOL, kUnsigned&gt;"], [1, 8, 1, "_CPPv4I_bEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kBOOLE9kUnsignedEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kBOOL, kUnsigned&gt;::kUnsigned"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kBOOLE9kUnsignedE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kBOOL, kUnsigned&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kBOOLE9kUnsignedE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kBOOL, kUnsigned&gt;::size"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kBOOLE9kUnsignedE4typeE", 
"tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kBOOL, kUnsigned&gt;::type"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kFLOATEEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kFLOAT&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kFLOATEE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kFLOAT&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kFLOATEE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kFLOAT&gt;::size"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kFLOATEE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kFLOAT&gt;::type"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kHALFEEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kHALF&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kHALFEE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kHALF&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kHALFEE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kHALF&gt;::size"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kHALFEE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kHALF&gt;::type"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EXL1EEEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT32, true&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EXL1EEE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT32, true&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EXL1EEE4sizeE", 
"tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT32, true&gt;::size"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EXL1EEE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT32, true&gt;::type"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT32&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT32&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT32&gt;::size"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT32EE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT32&gt;::type"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64, true&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64, true&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64, true&gt;::size"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EXL1EEE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64, true&gt;::type"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EE4nameE", 
"tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64&gt;::size"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kINT64EE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT64&gt;::type"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kINT8EEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT8&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kINT8EE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT8&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kINT8EE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT8&gt;::size"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType5kINT8EE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kINT8&gt;::type"], [1, 2, 1, "_CPPv4I_bEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kUINT8E9kUnsignedEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kUINT8, kUnsigned&gt;"], [1, 8, 1, "_CPPv4I_bEN12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kUINT8E9kUnsignedEE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kUINT8, kUnsigned&gt;::kUnsigned"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kUINT8E9kUnsignedE4nameE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kUINT8, kUnsigned&gt;::name"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kUINT8E9kUnsignedE4sizeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kUINT8, kUnsigned&gt;::size"], [1, 1, 1, 
"_CPPv4N12tensorrt_llm7runtime14DataTypeTraitsIN8nvinfer18DataType6kUINT8E9kUnsignedE4typeE", "tensorrt_llm::runtime::DataTypeTraits&lt;nvinfer1::DataType::kUINT8, kUnsigned&gt;::type"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInputE", "tensorrt_llm::runtime::DecodingInput"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13DecodingInputE10SizeType3210SizeType3210SizeType3210SizeType3214TensorConstPtr9TensorPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::DecodingInput"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13DecodingInputE10SizeType3210SizeType3210SizeType3210SizeType3214TensorConstPtr9TensorPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::DecodingInput::batchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13DecodingInputE10SizeType3210SizeType3210SizeType3210SizeType3214TensorConstPtr9TensorPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::DecodingInput::batchSlots"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13DecodingInputE10SizeType3210SizeType3210SizeType3210SizeType3214TensorConstPtr9TensorPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::DecodingInput::endIds"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13DecodingInputE10SizeType3210SizeType3210SizeType3210SizeType3214TensorConstPtr9TensorPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::DecodingInput::logits"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13DecodingInputE10SizeType3210SizeType3210SizeType3210SizeType3214TensorConstPtr9TensorPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::DecodingInput::maxAttentionWindow"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13DecodingInputE10SizeType3210SizeType3210SizeType3210SizeType3214TensorConstPtr9TensorPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::DecodingInput::maxLength"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime13DecodingInput13DecodingInputE10SizeType3210SizeType3210SizeType3210SizeType3214TensorConstPtr9TensorPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::DecodingInput::sinkTokenLength"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputsE", "tensorrt_llm::runtime::DecodingInput::EagleInputs"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::acceptedLens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::acceptedPathIds"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::acceptedTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", 
"tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::chunkedContextNextTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::lastDraftLens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::lastDraftPaths"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::lastDraftTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::nextDraftLens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::nextDraftPaths"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::nextDraftTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs11EagleInputsE14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr14TensorConstPtr", "tensorrt_llm::runtime::DecodingInput::EagleInputs::EagleInputs::seqSlots"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs12acceptedLensE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::acceptedLens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs15acceptedPathIdsE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::acceptedPathIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs14acceptedTokensE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::acceptedTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs24chunkedContextNextTokensE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::chunkedContextNextTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs13lastDraftLensE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::lastDraftLens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs14lastDraftPathsE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::lastDraftPaths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs15lastDraftTokensE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::lastDraftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs13nextDraftLensE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::nextDraftLens"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs14nextDraftPathsE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::nextDraftPaths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs15nextDraftTokensE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::nextDraftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11EagleInputs8seqSlotsE", "tensorrt_llm::runtime::DecodingInput::EagleInputs::seqSlots"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputsE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs15bestPathIndicesE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::bestPathIndices"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs15bestPathLengthsE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::bestPathLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs16lastDraftIndicesE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::lastDraftIndices"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs15lastDraftTokensE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::lastDraftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs21lastGenerationLengthsE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::lastGenerationLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs19lastPositionIdsBaseE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::lastPositionIdsBase"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs5masksE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::masks"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs18maxGenLengthDeviceE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::maxGenLengthDevice"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs16nextDraftIndicesE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::nextDraftIndices"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs14nextDraftProbsE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::nextDraftProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs15nextDraftTokensE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::nextDraftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs14nextFlatTokensE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::nextFlatTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs21nextGenerationLengthsE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::nextGenerationLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs17packedPositionIdsE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::packedPositionIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExplicitDraftTokensInputs8seqSlotsE", "tensorrt_llm::runtime::DecodingInput::ExplicitDraftTokensInputs::seqSlots"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputsE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs17constantThresholdE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::constantThreshold"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs11draftLogitsE", 
"tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::draftLogits"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs10draftProbsE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::draftProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs13draftTokenIdsE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::draftTokenIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs14numDraftTokensE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::numDraftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs18numDraftTokensHostE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::numDraftTokensHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs4stepE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::step"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs11targetProbsE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::targetProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs14useDraftLogitsE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::useDraftLogits"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs18useDraftLogitsHostE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::useDraftLogitsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25ExternalDraftTokensInputs28useRandomAcceptanceThresholdE", "tensorrt_llm::runtime::DecodingInput::ExternalDraftTokensInputs::useRandomAcceptanceThreshold"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15LookaheadInputsE", "tensorrt_llm::runtime::DecodingInput::LookaheadInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15LookaheadInputs13tokensPerStepE", 
"tensorrt_llm::runtime::DecodingInput::LookaheadInputs::tokensPerStep"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputsE", "tensorrt_llm::runtime::DecodingInput::MedusaInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs22medusaCurTokensPerStepE", "tensorrt_llm::runtime::DecodingInput::MedusaInputs::medusaCurTokensPerStep"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs12medusaLogitsE", "tensorrt_llm::runtime::DecodingInput::MedusaInputs::medusaLogits"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs11medusaPathsE", "tensorrt_llm::runtime::DecodingInput::MedusaInputs::medusaPaths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs25medusaTargetTokensPerStepE", "tensorrt_llm::runtime::DecodingInput::MedusaInputs::medusaTargetTokensPerStep"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12MedusaInputs13medusaTreeIdsE", "tensorrt_llm::runtime::DecodingInput::MedusaInputs::medusaTreeIds"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput14TensorConstPtrE", "tensorrt_llm::runtime::DecodingInput::TensorConstPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput9TensorPtrE", "tensorrt_llm::runtime::DecodingInput::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12badWordsLensE", "tensorrt_llm::runtime::DecodingInput::badWordsLens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13badWordsListsE", "tensorrt_llm::runtime::DecodingInput::badWordsLists"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12badWordsPtrsE", "tensorrt_llm::runtime::DecodingInput::badWordsPtrs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput9batchSizeE", "tensorrt_llm::runtime::DecodingInput::batchSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput10batchSlotsE", "tensorrt_llm::runtime::DecodingInput::batchSlots"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput10beamWidthsE", 
"tensorrt_llm::runtime::DecodingInput::beamWidths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput16cacheIndirectionE", "tensorrt_llm::runtime::DecodingInput::cacheIndirection"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput11eagleInputsE", "tensorrt_llm::runtime::DecodingInput::eagleInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13embeddingBiasE", "tensorrt_llm::runtime::DecodingInput::embeddingBias"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput6endIdsE", "tensorrt_llm::runtime::DecodingInput::endIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25explicitDraftTokensInputsE", "tensorrt_llm::runtime::DecodingInput::explicitDraftTokensInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput25externalDraftTokensInputsE", "tensorrt_llm::runtime::DecodingInput::externalDraftTokensInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13finishReasonsE", "tensorrt_llm::runtime::DecodingInput::finishReasons"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15generationStepsE", "tensorrt_llm::runtime::DecodingInput::generationSteps"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput7lengthsE", "tensorrt_llm::runtime::DecodingInput::lengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput6logitsE", "tensorrt_llm::runtime::DecodingInput::logits"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput9logitsVecE", "tensorrt_llm::runtime::DecodingInput::logitsVec"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15lookaheadInputsE", "tensorrt_llm::runtime::DecodingInput::lookaheadInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput18maxAttentionWindowE", "tensorrt_llm::runtime::DecodingInput::maxAttentionWindow"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput14maxBadWordsLenE", "tensorrt_llm::runtime::DecodingInput::maxBadWordsLen"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput9maxLengthE", 
"tensorrt_llm::runtime::DecodingInput::maxLength"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15maxStopWordsLenE", "tensorrt_llm::runtime::DecodingInput::maxStopWordsLen"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput12medusaInputsE", "tensorrt_llm::runtime::DecodingInput::medusaInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput17noRepeatNgramSizeE", "tensorrt_llm::runtime::DecodingInput::noRepeatNgramSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput19sequenceLimitLengthE", "tensorrt_llm::runtime::DecodingInput::sequenceLimitLength"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput15sinkTokenLengthE", "tensorrt_llm::runtime::DecodingInput::sinkTokenLength"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput4stepE", "tensorrt_llm::runtime::DecodingInput::step"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13stopWordsLensE", "tensorrt_llm::runtime::DecodingInput::stopWordsLens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput14stopWordsListsE", "tensorrt_llm::runtime::DecodingInput::stopWordsLists"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13DecodingInput13stopWordsPtrsE", "tensorrt_llm::runtime::DecodingInput::stopWordsPtrs"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutputE", "tensorrt_llm::runtime::DecodingOutput"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypothesesE", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses10batchDonesE", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::batchDones"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses14cumLogProbsCBAE", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::cumLogProbsCBA"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses5emptyERK13BufferManager", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::empty"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses5emptyERK13BufferManager", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::empty::manager"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses4initERK13BufferManager11TokenIdType", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::init"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses4initERK13BufferManager11TokenIdType", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::init::endId"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses4initERK13BufferManager11TokenIdType", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::init::manager"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses11logProbsCBAE", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::logProbsCBA"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses18minNormedScoresCBAE", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::minNormedScoresCBA"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses15normedScoresCBAE", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::normedScoresCBA"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses11numBeamsCBAE", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::numBeamsCBA"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses12outputIdsCBAE", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::outputIdsCBA"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses7releaseEv", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::release"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses7reshapeE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::reshape"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses7reshapeE10SizeType3210SizeType3210SizeType32", 
"tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::reshape::batchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses7reshapeE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::reshape::beamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses7reshapeE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::reshape::maxSequenceLength"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses18sequenceLengthsCBAE", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::sequenceLengthsCBA"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses5sliceE10SizeType3210SizeType32", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::slice"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses5sliceE10SizeType3210SizeType32", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::slice::batchIndex"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime14DecodingOutput14BeamHypotheses5sliceE10SizeType3210SizeType32", "tensorrt_llm::runtime::DecodingOutput::BeamHypotheses::slice::size"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14DecodingOutputE9TensorPtr9TensorPtr", "tensorrt_llm::runtime::DecodingOutput::DecodingOutput"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14DecodingOutputE9TensorPtr9TensorPtr", "tensorrt_llm::runtime::DecodingOutput::DecodingOutput::gatheredIds"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14DecodingOutputE9TensorPtr9TensorPtr", "tensorrt_llm::runtime::DecodingOutput::DecodingOutput::ids"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputsE", "tensorrt_llm::runtime::DecodingOutput::SpeculativeDecodingOutputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs21acceptedLengthsCumSumE", 
"tensorrt_llm::runtime::DecodingOutput::SpeculativeDecodingOutputs::acceptedLengthsCumSum"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs17acceptedTokensLenE", "tensorrt_llm::runtime::DecodingOutput::SpeculativeDecodingOutputs::acceptedTokensLen"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs15nextDraftTokensE", "tensorrt_llm::runtime::DecodingOutput::SpeculativeDecodingOutputs::nextDraftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs18nextDraftTokensLenE", "tensorrt_llm::runtime::DecodingOutput::SpeculativeDecodingOutputs::nextDraftTokensLen"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs12pathsOffsetsE", "tensorrt_llm::runtime::DecodingOutput::SpeculativeDecodingOutputs::pathsOffsets"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26SpeculativeDecodingOutputs18prevDraftTokensLenE", "tensorrt_llm::runtime::DecodingOutput::SpeculativeDecodingOutputs::prevDraftTokensLen"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput9TensorPtrE", "tensorrt_llm::runtime::DecodingOutput::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14beamHypothesesE", "tensorrt_llm::runtime::DecodingOutput::beamHypotheses"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput16cacheIndirectionE", "tensorrt_llm::runtime::DecodingOutput::cacheIndirection"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput11cumLogProbsE", "tensorrt_llm::runtime::DecodingOutput::cumLogProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput12eagleBuffersE", "tensorrt_llm::runtime::DecodingOutput::eagleBuffers"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26explicitDraftTokensBuffersE", "tensorrt_llm::runtime::DecodingOutput::explicitDraftTokensBuffers"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput13finishReasonsE", 
"tensorrt_llm::runtime::DecodingOutput::finishReasons"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput11finishedSumE", "tensorrt_llm::runtime::DecodingOutput::finishedSum"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput11gatheredIdsE", "tensorrt_llm::runtime::DecodingOutput::gatheredIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput3idsE", "tensorrt_llm::runtime::DecodingOutput::ids"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput17kNegativeInfinityE", "tensorrt_llm::runtime::DecodingOutput::kNegativeInfinity"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput7lengthsE", "tensorrt_llm::runtime::DecodingOutput::lengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput8logProbsE", "tensorrt_llm::runtime::DecodingOutput::logProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput13logProbsTiledE", "tensorrt_llm::runtime::DecodingOutput::logProbsTiled"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput16lookaheadOutputsE", "tensorrt_llm::runtime::DecodingOutput::lookaheadOutputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput9newTokensE", "tensorrt_llm::runtime::DecodingOutput::newTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput14newTokensStepsE", "tensorrt_llm::runtime::DecodingOutput::newTokensSteps"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput12newTokensVecE", "tensorrt_llm::runtime::DecodingOutput::newTokensVec"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput9parentIdsE", "tensorrt_llm::runtime::DecodingOutput::parentIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14DecodingOutput26speculativeDecodingOutputsE", "tensorrt_llm::runtime::DecodingOutput::speculativeDecodingOutputs"], [1, 2, 1, "_CPPv4I0EN12tensorrt_llm7runtime20DeviceAllocationNvlsE", "tensorrt_llm::runtime::DeviceAllocationNvls"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls20DeviceAllocationNvlsEv", 
"tensorrt_llm::runtime::DeviceAllocationNvls::DeviceAllocationNvls"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime20DeviceAllocationNvlsE", "tensorrt_llm::runtime::DeviceAllocationNvls::T"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls9_capacityE", "tensorrt_llm::runtime::DeviceAllocationNvls::_capacity"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls7_handleE", "tensorrt_llm::runtime::DeviceAllocationNvls::_handle"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls4freeEv", "tensorrt_llm::runtime::DeviceAllocationNvls::free"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime20DeviceAllocationNvls11getCapacityEv", "tensorrt_llm::runtime::DeviceAllocationNvls::getCapacity"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls21getIpcUnicastPointersEv", "tensorrt_llm::runtime::DeviceAllocationNvls::getIpcUnicastPointers"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime20DeviceAllocationNvls19getMulticastPointerEv", "tensorrt_llm::runtime::DeviceAllocationNvls::getMulticastPointer"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime20DeviceAllocationNvls17getUnicastPointerEv", "tensorrt_llm::runtime::DeviceAllocationNvls::getUnicastPointer"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls5resetE6size_tNSt3setIiEE", "tensorrt_llm::runtime::DeviceAllocationNvls::reset"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls5resetE6size_tNSt3setIiEE", "tensorrt_llm::runtime::DeviceAllocationNvls::reset::ranks"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvls5resetE6size_tNSt3setIiEE", "tensorrt_llm::runtime::DeviceAllocationNvls::reset::size"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20DeviceAllocationNvlsD0Ev", "tensorrt_llm::runtime::DeviceAllocationNvls::~DeviceAllocationNvls"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffersE", "tensorrt_llm::runtime::EagleBuffers"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers9BufferPtrE", 
"tensorrt_llm::runtime::EagleBuffers::BufferPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12EagleBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN8executor14DecodingConfigE", "tensorrt_llm::runtime::EagleBuffers::EagleBuffers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12EagleBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN8executor14DecodingConfigE", "tensorrt_llm::runtime::EagleBuffers::EagleBuffers::decodingConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12EagleBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN8executor14DecodingConfigE", "tensorrt_llm::runtime::EagleBuffers::EagleBuffers::manager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12EagleBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN8executor14DecodingConfigE", "tensorrt_llm::runtime::EagleBuffers::EagleBuffers::maxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12EagleBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN8executor14DecodingConfigE", "tensorrt_llm::runtime::EagleBuffers::EagleBuffers::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12EagleBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN8executor14DecodingConfigE", "tensorrt_llm::runtime::EagleBuffers::EagleBuffers::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12EagleBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN8executor14DecodingConfigE", "tensorrt_llm::runtime::EagleBuffers::EagleBuffers::worldConfig"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputsE", 
"tensorrt_llm::runtime::EagleBuffers::EngineOutputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs12acceptedLensE", "tensorrt_llm::runtime::EagleBuffers::EngineOutputs::acceptedLens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs13acceptedPathsE", "tensorrt_llm::runtime::EagleBuffers::EngineOutputs::acceptedPaths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs14acceptedTokensE", "tensorrt_llm::runtime::EagleBuffers::EngineOutputs::acceptedTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs24chunkedContextNextTokensE", "tensorrt_llm::runtime::EagleBuffers::EngineOutputs::chunkedContextNextTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs13nextDraftLensE", "tensorrt_llm::runtime::EagleBuffers::EngineOutputs::nextDraftLens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs14nextDraftPathsE", "tensorrt_llm::runtime::EagleBuffers::EngineOutputs::nextDraftPaths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13EngineOutputs15nextDraftTokensE", "tensorrt_llm::runtime::EagleBuffers::EngineOutputs::nextDraftTokens"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers7ITensorE", "tensorrt_llm::runtime::EagleBuffers::ITensor"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6InputsE", "tensorrt_llm::runtime::EagleBuffers::Inputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs22allLayersDraftTokenIdsE", "tensorrt_llm::runtime::EagleBuffers::Inputs::allLayersDraftTokenIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs33allLayersDraftTokenIdsPredecessorE", "tensorrt_llm::runtime::EagleBuffers::Inputs::allLayersDraftTokenIdsPredecessor"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs15allLayersScoresE", "tensorrt_llm::runtime::EagleBuffers::Inputs::allLayersScores"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs24chunkedContextNextTokensE", 
"tensorrt_llm::runtime::EagleBuffers::Inputs::chunkedContextNextTokens"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs6createE10SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::EagleBuffers::Inputs::create"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs6createE10SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::EagleBuffers::Inputs::create::manager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs6createE10SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::EagleBuffers::Inputs::create::maxNumSequences"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs6createE10SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::EagleBuffers::Inputs::create::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs6createE10SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::EagleBuffers::Inputs::create::worldConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs20currentExpandIndicesE", "tensorrt_llm::runtime::EagleBuffers::Inputs::currentExpandIndices"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs9draftLensE", "tensorrt_llm::runtime::EagleBuffers::Inputs::draftLens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs10draftPathsE", "tensorrt_llm::runtime::EagleBuffers::Inputs::draftPaths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs14draftPathsHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::draftPathsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs11draftTokensE", "tensorrt_llm::runtime::EagleBuffers::Inputs::draftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs22dynamicTreeMaxTopKHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::dynamicTreeMaxTopKHost"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs29eagleNetCtxContextLengthsHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::eagleNetCtxContextLengthsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs34eagleNetCtxPastKeyValueLengthsHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::eagleNetCtxPastKeyValueLengthsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs27eagleNetCtxRequestTypesHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::eagleNetCtxRequestTypesHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs29eagleNetGenContextLengthsHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::eagleNetGenContextLengthsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs34eagleNetGenPastKeyValueLengthsHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::eagleNetGenPastKeyValueLengthsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs27eagleNetGenRequestTypesHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::eagleNetGenRequestTypesHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs18inputGenTokensHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::inputGenTokensHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs14posteriorAlphaE", "tensorrt_llm::runtime::EagleBuffers::Inputs::posteriorAlpha"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs18posteriorThresholdE", "tensorrt_llm::runtime::EagleBuffers::Inputs::posteriorThreshold"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs10prevScoresE", "tensorrt_llm::runtime::EagleBuffers::Inputs::prevScores"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs16randomDataSampleE", "tensorrt_llm::runtime::EagleBuffers::Inputs::randomDataSample"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs20randomDataValidationE", "tensorrt_llm::runtime::EagleBuffers::Inputs::randomDataValidation"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs29specDecodingGenerationLengthsE", "tensorrt_llm::runtime::EagleBuffers::Inputs::specDecodingGenerationLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs33specDecodingGenerationLengthsHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::specDecodingGenerationLengthsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs23specDecodingPackedMasksE", "tensorrt_llm::runtime::EagleBuffers::Inputs::specDecodingPackedMasks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs27specDecodingPositionOffsetsE", "tensorrt_llm::runtime::EagleBuffers::Inputs::specDecodingPositionOffsets"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs12temperaturesE", "tensorrt_llm::runtime::EagleBuffers::Inputs::temperatures"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs18useDynamicTreeHostE", "tensorrt_llm::runtime::EagleBuffers::Inputs::useDynamicTreeHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers6Inputs15useSpecDecodingE", "tensorrt_llm::runtime::EagleBuffers::Inputs::useSpecDecoding"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13LlmRequestPtrE", "tensorrt_llm::runtime::EagleBuffers::LlmRequestPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13RequestVectorE", "tensorrt_llm::runtime::EagleBuffers::RequestVector"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers10SizeType32E", "tensorrt_llm::runtime::EagleBuffers::SizeType32"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers9TensorMapE", "tensorrt_llm::runtime::EagleBuffers::TensorMap"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers9TensorPtrE", "tensorrt_llm::runtime::EagleBuffers::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers28chunkedContextNextTokensHostE", "tensorrt_llm::runtime::EagleBuffers::chunkedContextNextTokensHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers23cumSumGenerationLengthsE", 
"tensorrt_llm::runtime::EagleBuffers::cumSumGenerationLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers12engineInputsE", "tensorrt_llm::runtime::EagleBuffers::engineInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers13engineOutputsE", "tensorrt_llm::runtime::EagleBuffers::engineOutputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers18greedySamplingHostE", "tensorrt_llm::runtime::EagleBuffers::greedySamplingHost"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::insertInputTensors"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::insertInputTensors::inputBuffers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::insertInputTensors::outputBuffers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::insertInputTensors::worldConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers26mDefaultPosteriorThresholdE", "tensorrt_llm::runtime::EagleBuffers::mDefaultPosteriorThreshold"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers17mDoGreedySamplingE", "tensorrt_llm::runtime::EagleBuffers::mDoGreedySampling"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers19maxGenerationLengthE", "tensorrt_llm::runtime::EagleBuffers::maxGenerationLength"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers18posteriorAlphaHostE", "tensorrt_llm::runtime::EagleBuffers::posteriorAlphaHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers22posteriorThresholdHostE", "tensorrt_llm::runtime::EagleBuffers::posteriorThresholdHost"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime12EagleBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", "tensorrt_llm::runtime::EagleBuffers::reshape"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", "tensorrt_llm::runtime::EagleBuffers::reshape::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", "tensorrt_llm::runtime::EagleBuffers::reshape::numCtxSequences"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", "tensorrt_llm::runtime::EagleBuffers::reshape::numGenSequences"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers21scanReduceTempStorageE", "tensorrt_llm::runtime::EagleBuffers::scanReduceTempStorage"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12EagleBuffers26scanReduceTempStorageBytesE", "tensorrt_llm::runtime::EagleBuffers::scanReduceTempStorageBytes"], [1, 3, 1, "_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs"], [1, 8, 1, "_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::T"], [1, 4, 1, 
"_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::contextRequests"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::contextRequests"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::decoderBuffers"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::draftBuffers"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::eagleModule"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::genRequests"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", 
"tensorrt_llm::runtime::EagleBuffers::setFromInputs::genRequests"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::manager"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::manager"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::modelConfig"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::requestTypes"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::seqSlots"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::seqSlots"], [1, 4, 1, 
"_CPPv4I0ENK12tensorrt_llm7runtime12EagleBuffers13setFromInputsEvRK13RequestVectorRK13RequestVector10SizeType32RK7ITensorRKN12EagleBuffers6InputsERKN7runtime11EagleModuleERKN7runtime13BufferManagerE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::vocabSizePadded"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime12EagleBuffers13setFromInputsERK13RequestVectorRK13RequestVectorRKN7runtime7ITensorERK7ITensorRKN12EagleBuffers6InputsERKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::EagleBuffers::setFromInputs::worldConfig"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModuleE", "tensorrt_llm::runtime::EagleModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModule11EagleModuleE10SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::EagleModule::EagleModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModule11EagleModuleEv", "tensorrt_llm::runtime::EagleModule::EagleModule"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModule11EagleModuleE10SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::EagleModule::EagleModule::maxDecodingDraftTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModule11EagleModuleE10SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::EagleModule::EagleModule::maxDraftPathLen"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModule11EagleModuleE10SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::EagleModule::EagleModule::maxNonLeafNodesPerLayer"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModule11EagleModuleE10SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::EagleModule::EagleModule::numTransformersLayer"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11EagleModule22getDefaultEagleChoicesEv", "tensorrt_llm::runtime::EagleModule::getDefaultEagleChoices"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11EagleModule26getMaxNonLeafNodesPerLayerEv", 
"tensorrt_llm::runtime::EagleModule::getMaxNonLeafNodesPerLayer"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11EagleModule23getNumTransformerLayersEv", "tensorrt_llm::runtime::EagleModule::getNumTransformerLayers"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModule20mDefaultEagleChoicesE", "tensorrt_llm::runtime::EagleModule::mDefaultEagleChoices"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModule24mMaxNonLeafNodesPerLayerE", "tensorrt_llm::runtime::EagleModule::mMaxNonLeafNodesPerLayer"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11EagleModule21mNumTransformersLayerE", "tensorrt_llm::runtime::EagleModule::mNumTransformersLayer"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffersE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers9BufferPtrE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::BufferPtr"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers12EngineInputsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers12EngineInputs15positionOffsetsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs::positionOffsets"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers12EngineInputs18requestTypesDeviceE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineInputs::requestTypesDevice"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs15bestPathIndicesE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::bestPathIndices"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs15bestPathLengthsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::bestPathLengths"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs5masksE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::masks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs11maxGenTokenE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::maxGenToken"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs16nextDraftIndicesE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::nextDraftIndices"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs14nextDraftProbsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::nextDraftProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs15nextDraftTokensE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::nextDraftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs14nextFlatTokensE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::nextFlatTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs21nextGenerationLengthsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::nextGenerationLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs19nextPositionOffsetsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::nextPositionOffsets"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs17packedPositionIdsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::packedPositionIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13EngineOutputs13totalGenTokenE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::EngineOutputs::totalGenToken"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers26ExplicitDraftTokensBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::ExplicitDraftTokensBuffers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers26ExplicitDraftTokensBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::ExplicitDraftTokensBuffers::manager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers26ExplicitDraftTokensBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::ExplicitDraftTokensBuffers::maxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers26ExplicitDraftTokensBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::ExplicitDraftTokensBuffers::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers26ExplicitDraftTokensBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::ExplicitDraftTokensBuffers::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers26ExplicitDraftTokensBuffersE10SizeType3210SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::ExplicitDraftTokensBuffers::worldConfig"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers7ITensorE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::ITensor"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6InputsE", 
"tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs6createE10SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::create"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs6createE10SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::create::manager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs6createE10SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::create::maxNumSequences"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs6createE10SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::create::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs6createE10SizeType32RKN7runtime13BufferManagerERKN7runtime11ModelConfigERKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::create::worldConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs12draftIndicesE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::draftIndices"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs10draftProbsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::draftProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs11draftTokensE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::draftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs17generationLengthsE", 
"tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::generationLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs21generationLengthsHostE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::generationLengthsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs16maxGenLengthHostE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::maxGenLengthHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs11packedMasksE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::packedMasks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs11positionIdsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::positionIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs15positionIdsBaseE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::positionIdsBase"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs16randomDataSampleE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::randomDataSample"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs20randomDataValidationE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::randomDataValidation"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs12temperaturesE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::temperatures"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers6Inputs15useSpecDecodingE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::Inputs::useSpecDecoding"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers10SizeType32E", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::SizeType32"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers9TensorMapE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::TensorMap"], [1, 1, 1, 
"_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers9TensorPtrE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers23cumSumGenerationLengthsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::cumSumGenerationLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers12engineInputsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::engineInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13engineOutputsE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::engineOutputs"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::insertInputTensors"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::insertInputTensors::inputBuffers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::insertInputTensors::outputBuffers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers18insertInputTensorsER9TensorMapR9TensorMapRKN7runtime11WorldConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::insertInputTensors::worldConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::reshape"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::reshape::modelConfig"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::reshape::numCtxSequences"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers7reshapeE10SizeType3210SizeType32RKN7runtime11ModelConfigE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::reshape::numGenSequences"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers15scanTempStorageE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::scanTempStorage"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26ExplicitDraftTokensBuffers20scanTempStorageBytesE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::scanTempStorageBytes"], [1, 3, 1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs"], [1, 8, 1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::T"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", 
"tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::contextPositionIds"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::contextPositionIds"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::decoderBuffers"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::draftBuffers"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::explicitDraftTokensModule"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::manager"], [1, 4, 1, 
"_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::modelConfig"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::numCtxSequences"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::numCtxSequences"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::numGenSequences"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::numGenSequences"], [1, 4, 1, 
"_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::requestTypes"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::seqSlots"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::seqSlots"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::stream"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::stream"], [1, 4, 1, 
"_CPPv4I0ENK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsEv10SizeType3210SizeType3210SizeType32RK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime25ExplicitDraftTokensModuleERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::vocabSizePadded"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime26ExplicitDraftTokensBuffers13setFromInputsE10SizeType3210SizeType32RKN7runtime7ITensorERK7ITensorRKN26ExplicitDraftTokensBuffers6InputsERK7ITensorRKN7runtime11ModelConfigERKN7runtime11WorldConfigERKN7runtime13BufferManagerERKN7runtime10CudaStreamE", "tensorrt_llm::runtime::ExplicitDraftTokensBuffers::setFromInputs::worldConfig"], [1, 2, 1, "_CPPv4I0EN12tensorrt_llm7runtime25GenericPromptTuningParamsE", "tensorrt_llm::runtime::GenericPromptTuningParams"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams25GenericPromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", "tensorrt_llm::runtime::GenericPromptTuningParams::GenericPromptTuningParams"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams25GenericPromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", "tensorrt_llm::runtime::GenericPromptTuningParams::GenericPromptTuningParams::embeddingTable"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams25GenericPromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", "tensorrt_llm::runtime::GenericPromptTuningParams::GenericPromptTuningParams::tasks"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams25GenericPromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", "tensorrt_llm::runtime::GenericPromptTuningParams::GenericPromptTuningParams::vocabSize"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams10SizeType32E", "tensorrt_llm::runtime::GenericPromptTuningParams::SizeType32"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime25GenericPromptTuningParamsE", "tensorrt_llm::runtime::GenericPromptTuningParams::TTensor"], [1, 1, 1, 
"_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams9TensorPtrE", "tensorrt_llm::runtime::GenericPromptTuningParams::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams14embeddingTableE", "tensorrt_llm::runtime::GenericPromptTuningParams::embeddingTable"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams19promptTuningEnabledE", "tensorrt_llm::runtime::GenericPromptTuningParams::promptTuningEnabled"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams5tasksE", "tensorrt_llm::runtime::GenericPromptTuningParams::tasks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime25GenericPromptTuningParams9vocabSizeE", "tensorrt_llm::runtime::GenericPromptTuningParams::vocabSize"], [1, 2, 1, "_CPPv4I0EN12tensorrt_llm7runtime10GptDecoderE", "tensorrt_llm::runtime::GptDecoder"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder13CudaStreamPtrE", "tensorrt_llm::runtime::GptDecoder::CudaStreamPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::GptDecoder::GptDecoder"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::GptDecoder::GptDecoder::maxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::GptDecoder::GptDecoder::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::GptDecoder::GptDecoder::maxSequenceLength"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::GptDecoder::GptDecoder::mode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::GptDecoder::GptDecoder::speculativeDecodingModule"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::GptDecoder::GptDecoder::stream"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::GptDecoder::GptDecoder::vocabSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10GptDecoderERKN8executor12DecodingModeE6size_t6size_t6size_t6size_t6size_tRK13CudaStreamPtrNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::GptDecoder::GptDecoder::vocabSizePadded"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime10GptDecoderE", "tensorrt_llm::runtime::GptDecoder::T"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder9TensorPtrE", "tensorrt_llm::runtime::GptDecoder::TensorPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", "tensorrt_llm::runtime::GptDecoder::disableLookahead"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", "tensorrt_llm::runtime::GptDecoder::disableLookahead::batchSize"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime10GptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", "tensorrt_llm::runtime::GptDecoder::disableLookahead::batchSlots"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", "tensorrt_llm::runtime::GptDecoder::disableLookahead::samplingConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder12forwardAsyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::GptDecoder::forwardAsync"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder12forwardAsyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::GptDecoder::forwardAsync::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder12forwardAsyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::GptDecoder::forwardAsync::output"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder11forwardSyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::GptDecoder::forwardSync"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder11forwardSyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::GptDecoder::forwardSync::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder11forwardSyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::GptDecoder::forwardSync::output"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder17getSamplingConfigEv", "tensorrt_llm::runtime::GptDecoder::getSamplingConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder23mDecodingLayerWorkspaceE", "tensorrt_llm::runtime::GptDecoder::mDecodingLayerWorkspace"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder13mDecodingModeE", "tensorrt_llm::runtime::GptDecoder::mDecodingMode"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder19mDynamicDecodeLayerE", "tensorrt_llm::runtime::GptDecoder::mDynamicDecodeLayer"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder8mManagerE", 
"tensorrt_llm::runtime::GptDecoder::mManager"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder13mMaxBatchSizeE", "tensorrt_llm::runtime::GptDecoder::mMaxBatchSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder15mSamplingConfigE", "tensorrt_llm::runtime::GptDecoder::mSamplingConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder10mVocabSizeE", "tensorrt_llm::runtime::GptDecoder::mVocabSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder16mVocabSizePaddedE", "tensorrt_llm::runtime::GptDecoder::mVocabSizePadded"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::GptDecoder::setup"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::GptDecoder::setup::batchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::GptDecoder::setup::batchSlots"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::GptDecoder::setup::output"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::GptDecoder::setup::requests"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10GptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", 
"tensorrt_llm::runtime::GptDecoder::setup::samplingConfig"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatchedE", "tensorrt_llm::runtime::GptDecoderBatched"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13CudaStreamPtrE", "tensorrt_llm::runtime::GptDecoderBatched::CudaStreamPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtrRK23SpeculativeDecodingModeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::GptDecoderBatched::GptDecoderBatched"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtrRK23SpeculativeDecodingModeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::GptDecoderBatched::GptDecoderBatched::dtype"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtrRK23SpeculativeDecodingModeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::GptDecoderBatched::GptDecoderBatched::speculativeDecodingMode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched17GptDecoderBatchedE13CudaStreamPtrRK23SpeculativeDecodingModeN8nvinfer18DataTypeE", "tensorrt_llm::runtime::GptDecoderBatched::GptDecoderBatched::stream"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13GptDecoderPtrE", "tensorrt_llm::runtime::GptDecoderBatched::GptDecoderPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13LlmRequestPtrE", "tensorrt_llm::runtime::GptDecoderBatched::LlmRequestPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13RequestVectorE", "tensorrt_llm::runtime::GptDecoderBatched::RequestVector"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14SharedConstPtrE", "tensorrt_llm::runtime::GptDecoderBatched::SharedConstPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched9TensorPtrE", "tensorrt_llm::runtime::GptDecoderBatched::TensorPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched16disableLookaheadERK13RequestVectorRK9TensorPtr", 
"tensorrt_llm::runtime::GptDecoderBatched::disableLookahead"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched16disableLookaheadERK13RequestVectorRK9TensorPtr", "tensorrt_llm::runtime::GptDecoderBatched::disableLookahead::batchSlots"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched16disableLookaheadERK13RequestVectorRK9TensorPtr", "tensorrt_llm::runtime::GptDecoderBatched::disableLookahead::genRequests"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::GptDecoderBatched::finalize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::GptDecoderBatched::finalize::batchSlot"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::GptDecoderBatched::finalize::decoderState"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::GptDecoderBatched::finalize::samplingConfig"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::GptDecoderBatched::finalize::streaming"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::forward"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::forward::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::forward::output"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::forwardAsync"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::forwardAsync::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::forwardAsync::output"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15forwardDispatchERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::forwardDispatch"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15forwardDispatchERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::forwardDispatch::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15forwardDispatchERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::forwardDispatch::output"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched16getBufferManagerEv", "tensorrt_llm::runtime::GptDecoderBatched::getBufferManager"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched15getDecoderStateEv", "tensorrt_llm::runtime::GptDecoderBatched::getDecoderState"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched15getDecoderStateEv", "tensorrt_llm::runtime::GptDecoderBatched::getDecoderState"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched16getDecoderStreamEv", "tensorrt_llm::runtime::GptDecoderBatched::getDecoderStream"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime17GptDecoderBatched20getUnderlyingDecoderEv", "tensorrt_llm::runtime::GptDecoderBatched::getUnderlyingDecoder"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14mBufferManagerE", 
"tensorrt_llm::runtime::GptDecoderBatched::mBufferManager"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched8mDecoderE", "tensorrt_llm::runtime::GptDecoderBatched::mDecoder"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched13mDecoderStateE", "tensorrt_llm::runtime::GptDecoderBatched::mDecoderState"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14mDecoderStreamE", "tensorrt_llm::runtime::GptDecoderBatched::mDecoderStream"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14mRuntimeStreamE", "tensorrt_llm::runtime::GptDecoderBatched::mRuntimeStream"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14prepareForwardE10SizeType32RN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::prepareForward"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14prepareForwardE10SizeType32RN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::prepareForward::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14prepareForwardE10SizeType32RN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::prepareForward::output"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14prepareForwardE10SizeType32RN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::prepareForward::step"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14setEagleInputsERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::setEagleInputs"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched14setEagleInputsERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::setEagleInputs::input"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched28setExplicitDraftTokensInputsERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::setExplicitDraftTokensInputs"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched28setExplicitDraftTokensInputsERKN13decoder_batch5InputE", "tensorrt_llm::runtime::GptDecoderBatched::setExplicitDraftTokensInputs::input"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::dtype"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::maxAttentionWindow"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::maxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::maxSequenceLength"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::maxTokensPerStep"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::mode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::sinkTokenLength"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17GptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::GptDecoderBatched::setup::worldConfig"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfigE", "tensorrt_llm::runtime::GptJsonConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", 
"tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig::contextParallelism"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig::gpusPerNode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig::name"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig::pipelineParallelism"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig::precision"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig::runtimeDefaults"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig::tensorParallelism"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime13GptJsonConfig13GptJsonConfigENSt6stringENSt6stringENSt6stringE10SizeType3210SizeType3210SizeType3210SizeType3211ModelConfigNSt8optionalI15RuntimeDefaultsEE", "tensorrt_llm::runtime::GptJsonConfig::GptJsonConfig::version"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14engineFilenameERK11WorldConfig", "tensorrt_llm::runtime::GptJsonConfig::engineFilename"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14engineFilenameERK11WorldConfigRKNSt6stringE", "tensorrt_llm::runtime::GptJsonConfig::engineFilename"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14engineFilenameERK11WorldConfigRKNSt6stringE", "tensorrt_llm::runtime::GptJsonConfig::engineFilename::model"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14engineFilenameERK11WorldConfig", "tensorrt_llm::runtime::GptJsonConfig::engineFilename::worldConfig"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14engineFilenameERK11WorldConfigRKNSt6stringE", "tensorrt_llm::runtime::GptJsonConfig::engineFilename::worldConfig"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig21getContextParallelismEv", "tensorrt_llm::runtime::GptJsonConfig::getContextParallelism"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14getGpusPerNodeEv", "tensorrt_llm::runtime::GptJsonConfig::getGpusPerNode"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig14getModelConfigEv", "tensorrt_llm::runtime::GptJsonConfig::getModelConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig21getModelConfigMutableEv", "tensorrt_llm::runtime::GptJsonConfig::getModelConfigMutable"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig7getNameEv", "tensorrt_llm::runtime::GptJsonConfig::getName"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig22getPipelineParallelismEv", "tensorrt_llm::runtime::GptJsonConfig::getPipelineParallelism"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig12getPrecisionEv", 
"tensorrt_llm::runtime::GptJsonConfig::getPrecision"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig18getRuntimeDefaultsEv", "tensorrt_llm::runtime::GptJsonConfig::getRuntimeDefaults"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig20getTensorParallelismEv", "tensorrt_llm::runtime::GptJsonConfig::getTensorParallelism"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig10getVersionEv", "tensorrt_llm::runtime::GptJsonConfig::getVersion"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime13GptJsonConfig12getWorldSizeEv", "tensorrt_llm::runtime::GptJsonConfig::getWorldSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig19mContextParallelismE", "tensorrt_llm::runtime::GptJsonConfig::mContextParallelism"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig12mGpusPerNodeE", "tensorrt_llm::runtime::GptJsonConfig::mGpusPerNode"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig12mModelConfigE", "tensorrt_llm::runtime::GptJsonConfig::mModelConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5mNameE", "tensorrt_llm::runtime::GptJsonConfig::mName"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig20mPipelineParallelismE", "tensorrt_llm::runtime::GptJsonConfig::mPipelineParallelism"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig10mPrecisionE", "tensorrt_llm::runtime::GptJsonConfig::mPrecision"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig16mRuntimeDefaultsE", "tensorrt_llm::runtime::GptJsonConfig::mRuntimeDefaults"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig18mTensorParallelismE", "tensorrt_llm::runtime::GptJsonConfig::mTensorParallelism"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig8mVersionE", "tensorrt_llm::runtime::GptJsonConfig::mVersion"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5parseERKNSt10filesystem4pathE", "tensorrt_llm::runtime::GptJsonConfig::parse"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5parseERKNSt6stringE", 
"tensorrt_llm::runtime::GptJsonConfig::parse"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5parseERNSt7istreamE", "tensorrt_llm::runtime::GptJsonConfig::parse"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5parseERKNSt6stringE", "tensorrt_llm::runtime::GptJsonConfig::parse::json"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5parseERNSt7istreamE", "tensorrt_llm::runtime::GptJsonConfig::parse::json"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13GptJsonConfig5parseERKNSt10filesystem4pathE", "tensorrt_llm::runtime::GptJsonConfig::parse::path"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime7IBufferE", "tensorrt_llm::runtime::IBuffer"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer8DataTypeE", "tensorrt_llm::runtime::IBuffer::DataType"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer7IBufferERK7IBuffer", "tensorrt_llm::runtime::IBuffer::IBuffer"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer7IBufferEv", "tensorrt_llm::runtime::IBuffer::IBuffer"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer14SharedConstPtrE", "tensorrt_llm::runtime::IBuffer::SharedConstPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer9SharedPtrE", "tensorrt_llm::runtime::IBuffer::SharedPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer14UniqueConstPtrE", "tensorrt_llm::runtime::IBuffer::UniqueConstPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer9UniquePtrE", "tensorrt_llm::runtime::IBuffer::UniquePtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4dataENSt6size_tE", "tensorrt_llm::runtime::IBuffer::data"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4dataEv", "tensorrt_llm::runtime::IBuffer::data"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer4dataENSt6size_tE", "tensorrt_llm::runtime::IBuffer::data"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer4dataEv", "tensorrt_llm::runtime::IBuffer::data"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4dataENSt6size_tE", "tensorrt_llm::runtime::IBuffer::data::index"], [1, 4, 1, 
"_CPPv4NK12tensorrt_llm7runtime7IBuffer4dataENSt6size_tE", "tensorrt_llm::runtime::IBuffer::data::index"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer11getCapacityEv", "tensorrt_llm::runtime::IBuffer::getCapacity"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer11getDataTypeEv", "tensorrt_llm::runtime::IBuffer::getDataType"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer15getDataTypeNameE8DataType", "tensorrt_llm::runtime::IBuffer::getDataTypeName"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer15getDataTypeNameEv", "tensorrt_llm::runtime::IBuffer::getDataTypeName"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer15getDataTypeNameE8DataType", "tensorrt_llm::runtime::IBuffer::getDataTypeName::dataType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer13getMemoryTypeEv", "tensorrt_llm::runtime::IBuffer::getMemoryType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer17getMemoryTypeNameEv", "tensorrt_llm::runtime::IBuffer::getMemoryTypeName"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer7getSizeEv", "tensorrt_llm::runtime::IBuffer::getSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer14getSizeInBytesEv", "tensorrt_llm::runtime::IBuffer::getSizeInBytes"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer10memoryTypeEPKv", "tensorrt_llm::runtime::IBuffer::memoryType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer10memoryTypeEPKv", "tensorrt_llm::runtime::IBuffer::memoryType::data"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBufferaSERK7IBuffer", "tensorrt_llm::runtime::IBuffer::operator="], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer7releaseEv", "tensorrt_llm::runtime::IBuffer::release"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer6resizeENSt6size_tE", "tensorrt_llm::runtime::IBuffer::resize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer6resizeENSt6size_tE", "tensorrt_llm::runtime::IBuffer::resize::newSize"], [1, 3, 1, 
"_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer5sliceE9SharedPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer5sliceE9SharedPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::TConstPtr"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::TConstPtr"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer5sliceE9SharedPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::buffer"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer5sliceE9SharedPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::buffer"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::offset"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::offset"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer5sliceE9SharedPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::offset"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime7IBuffer5sliceE9SharedPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::offset"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer5sliceE9SharedPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::size"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::tensor"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::slice::tensor"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer7toBytesENSt6size_tE", "tensorrt_llm::runtime::IBuffer::toBytes"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7IBuffer7toBytesENSt6size_tE", "tensorrt_llm::runtime::IBuffer::toBytes::size"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer4viewE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::view"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4viewE9SharedPtr", "tensorrt_llm::runtime::IBuffer::view"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4viewE9SharedPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::view"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer4viewE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::view::TConstPtr"], [1, 4, 1, 
"_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer4viewE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::view::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4viewE9SharedPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::view::size"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7IBuffer4viewE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::view::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4viewE9SharedPtr", "tensorrt_llm::runtime::IBuffer::view::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4viewE9SharedPtrNSt6size_tE", "tensorrt_llm::runtime::IBuffer::view::tensor"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrRNSt6vectorI1TEE", "tensorrt_llm::runtime::IBuffer::wrap"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrRNSt6vectorI1TEE", "tensorrt_llm::runtime::IBuffer::wrap::T"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::capacity"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::capacity"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::data"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::data"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::data"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::data"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::size"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrP1TNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::type"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7IBuffer4wrapEPv8DataTypeNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::IBuffer::wrap::type"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7IBuffer4wrapE9UniquePtrRNSt6vectorI1TEE", "tensorrt_llm::runtime::IBuffer::wrap::v"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7IBufferD0Ev", "tensorrt_llm::runtime::IBuffer::~IBuffer"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoderE", "tensorrt_llm::runtime::IGptDecoder"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder14TensorConstPtrE", "tensorrt_llm::runtime::IGptDecoder::TensorConstPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder9TensorPtrE", 
"tensorrt_llm::runtime::IGptDecoder::TensorPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create::dtype"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create::maxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create::maxSequenceLength"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create::mode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", 
"tensorrt_llm::runtime::IGptDecoder::create::speculativeDecodingModule"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create::stream"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create::vocabSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder6createERKN8executor12DecodingModeEN8nvinfer18DataTypeE6size_t6size_t6size_t6size_t6size_tRKN13BufferManager13CudaStreamPtrERKNSt10shared_ptrIK25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::IGptDecoder::create::vocabSizePadded"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", "tensorrt_llm::runtime::IGptDecoder::disableLookahead"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", "tensorrt_llm::runtime::IGptDecoder::disableLookahead::batchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", "tensorrt_llm::runtime::IGptDecoder::disableLookahead::batchSlots"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder16disableLookaheadERKNSt8optionalI14SamplingConfigEE10SizeType3214TensorConstPtr", "tensorrt_llm::runtime::IGptDecoder::disableLookahead::samplingConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder12forwardAsyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::IGptDecoder::forwardAsync"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime11IGptDecoder12forwardAsyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::IGptDecoder::forwardAsync::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder12forwardAsyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::IGptDecoder::forwardAsync::output"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder11forwardSyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::IGptDecoder::forwardSync"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder11forwardSyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::IGptDecoder::forwardSync::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder11forwardSyncER14DecodingOutputRK13DecodingInput", "tensorrt_llm::runtime::IGptDecoder::forwardSync::output"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder17getSamplingConfigEv", "tensorrt_llm::runtime::IGptDecoder::getSamplingConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::IGptDecoder::setup"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::IGptDecoder::setup::batchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::IGptDecoder::setup::batchSlots"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::IGptDecoder::setup::output"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime11IGptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::IGptDecoder::setup::requests"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoder5setupERK14SamplingConfig6size_tRK14TensorConstPtrRKNSt8optionalI14DecodingOutputEERKNSt8optionalIKNSt6vectorIN13decoder_batch7RequestEEEEE", "tensorrt_llm::runtime::IGptDecoder::setup::samplingConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11IGptDecoderD0Ev", "tensorrt_llm::runtime::IGptDecoder::~IGptDecoder"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatchedE", "tensorrt_llm::runtime::IGptDecoderBatched"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched13CudaStreamPtrE", "tensorrt_llm::runtime::IGptDecoderBatched::CudaStreamPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched18IGptDecoderBatchedEv", "tensorrt_llm::runtime::IGptDecoderBatched::IGptDecoderBatched"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched13LlmRequestPtrE", "tensorrt_llm::runtime::IGptDecoderBatched::LlmRequestPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched13RequestVectorE", "tensorrt_llm::runtime::IGptDecoderBatched::RequestVector"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched9TensorPtrE", "tensorrt_llm::runtime::IGptDecoderBatched::TensorPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched16disableLookaheadERK13RequestVectorRK9TensorPtr", "tensorrt_llm::runtime::IGptDecoderBatched::disableLookahead"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched16disableLookaheadERK13RequestVectorRK9TensorPtr", "tensorrt_llm::runtime::IGptDecoderBatched::disableLookahead::batchSlots"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched16disableLookaheadERK13RequestVectorRK9TensorPtr", "tensorrt_llm::runtime::IGptDecoderBatched::disableLookahead::genRequests"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime18IGptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::IGptDecoderBatched::finalize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime18IGptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::IGptDecoderBatched::finalize::batchSlot"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime18IGptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::IGptDecoderBatched::finalize::decoderState"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime18IGptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::IGptDecoderBatched::finalize::samplingConfig"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime18IGptDecoderBatched8finalizeERKN7decoder12DecoderStateE10SizeType32RK14SamplingConfigb", "tensorrt_llm::runtime::IGptDecoderBatched::finalize::streaming"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::IGptDecoderBatched::forward"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::IGptDecoderBatched::forward::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched7forwardERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::IGptDecoderBatched::forward::output"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::IGptDecoderBatched::forwardAsync"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::IGptDecoderBatched::forwardAsync::input"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched12forwardAsyncERN13decoder_batch6OutputERKN13decoder_batch5InputE", "tensorrt_llm::runtime::IGptDecoderBatched::forwardAsync::output"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::dtype"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::maxAttentionWindow"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::maxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::maxSequenceLength"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::maxTokensPerStep"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::mode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::sinkTokenLength"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatched5setupERKN8executor12DecodingModeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeERK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::IGptDecoderBatched::setup::worldConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime18IGptDecoderBatchedD0Ev", "tensorrt_llm::runtime::IGptDecoderBatched::~IGptDecoderBatched"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime7ITensorE", "tensorrt_llm::runtime::ITensor"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9DimType64E", "tensorrt_llm::runtime::ITensor::DimType64"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7ITensorERK7ITensor", "tensorrt_llm::runtime::ITensor::ITensor"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7ITensorEv", "tensorrt_llm::runtime::ITensor::ITensor"], [1, 1, 1, 
"_CPPv4N12tensorrt_llm7runtime7ITensor5ShapeE", "tensorrt_llm::runtime::ITensor::Shape"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor14SharedConstPtrE", "tensorrt_llm::runtime::ITensor::SharedConstPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9SharedPtrE", "tensorrt_llm::runtime::ITensor::SharedPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9TensorMapE", "tensorrt_llm::runtime::ITensor::TensorMap"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor14UniqueConstPtrE", "tensorrt_llm::runtime::ITensor::UniqueConstPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9UniquePtrE", "tensorrt_llm::runtime::ITensor::UniquePtr"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::at"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atEN7ITensor14UniqueConstPtrERR9TConstPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::at"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor2atE9SharedPtrRK5Shape", "tensorrt_llm::runtime::ITensor::at"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor2atE9SharedPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::at"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::at::TConstPtr"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atEN7ITensor14UniqueConstPtrERR9TConstPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::at::TConstPtr"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atE14UniqueConstPtrRR9TConstPtrRK5Shape", 
"tensorrt_llm::runtime::ITensor::at::offsetDims"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atEN7ITensor14UniqueConstPtrERR9TConstPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::at::offsetDims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor2atE9SharedPtrRK5Shape", "tensorrt_llm::runtime::ITensor::at::offsetDims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor2atE9SharedPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::at::offsetDims"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::at::tensor"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor2atEN7ITensor14UniqueConstPtrERR9TConstPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::at::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor2atE9SharedPtrRK5Shape", "tensorrt_llm::runtime::ITensor::at::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor2atE9SharedPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::at::tensor"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor8castSizeE6size_t", "tensorrt_llm::runtime::ITensor::castSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor8castSizeE6size_t", "tensorrt_llm::runtime::ITensor::castSize::newSize"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor8flattenNE9SharedPtrNSt7int64_tE", "tensorrt_llm::runtime::ITensor::flattenN"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor8flattenNE9SharedPtrNSt7int64_tE", "tensorrt_llm::runtime::ITensor::flattenN::sliceN"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor8flattenNE9SharedPtrNSt7int64_tE", "tensorrt_llm::runtime::ITensor::flattenN::tensor"], [1, 3, 1, 
"_CPPv4I_10SizeType32ENK12tensorrt_llm7runtime7ITensor12getDimensionE9DimType64v", "tensorrt_llm::runtime::ITensor::getDimension"], [1, 8, 1, "_CPPv4I_10SizeType32ENK12tensorrt_llm7runtime7ITensor12getDimensionE9DimType64v", "tensorrt_llm::runtime::ITensor::getDimension::n"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7ITensor8getShapeEv", "tensorrt_llm::runtime::ITensor::getShape"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9makeShapeERKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::makeShape"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9makeShapeERKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::makeShape::dims"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensoraSERK7ITensor", "tensorrt_llm::runtime::ITensor::operator="], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7reshapeERK5Shape", "tensorrt_llm::runtime::ITensor::reshape"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7reshapeERK5Shape", "tensorrt_llm::runtime::ITensor::reshape::dims"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor6resizeENSt6size_tE", "tensorrt_llm::runtime::ITensor::resize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor6resizeENSt6size_tE", "tensorrt_llm::runtime::ITensor::resize::newSize"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor11shapeEqualsEbRK5ShapePK1T10SizeType32", "tensorrt_llm::runtime::ITensor::shapeEquals"], [1, 3, 1, "_CPPv4I0ENK12tensorrt_llm7runtime7ITensor11shapeEqualsEbPK1T10SizeType32", "tensorrt_llm::runtime::ITensor::shapeEquals"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor11shapeEqualsERK5ShapeRK5Shape", "tensorrt_llm::runtime::ITensor::shapeEquals"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7ITensor11shapeEqualsERK5Shape", "tensorrt_llm::runtime::ITensor::shapeEquals"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7ITensor11shapeEqualsERKNSt16initializer_listI10SizeType32EE", "tensorrt_llm::runtime::ITensor::shapeEquals"], [1, 8, 1, 
"_CPPv4I0EN12tensorrt_llm7runtime7ITensor11shapeEqualsEbRK5ShapePK1T10SizeType32", "tensorrt_llm::runtime::ITensor::shapeEquals::T"], [1, 8, 1, "_CPPv4I0ENK12tensorrt_llm7runtime7ITensor11shapeEqualsEbPK1T10SizeType32", "tensorrt_llm::runtime::ITensor::shapeEquals::T"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor11shapeEqualsEbRK5ShapePK1T10SizeType32", "tensorrt_llm::runtime::ITensor::shapeEquals::count"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime7ITensor11shapeEqualsEbPK1T10SizeType32", "tensorrt_llm::runtime::ITensor::shapeEquals::count"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor11shapeEqualsEbRK5ShapePK1T10SizeType32", "tensorrt_llm::runtime::ITensor::shapeEquals::dims"], [1, 4, 1, "_CPPv4I0ENK12tensorrt_llm7runtime7ITensor11shapeEqualsEbPK1T10SizeType32", "tensorrt_llm::runtime::ITensor::shapeEquals::dims"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor11shapeEqualsEbRK5ShapePK1T10SizeType32", "tensorrt_llm::runtime::ITensor::shapeEquals::lhs"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor11shapeEqualsERK5ShapeRK5Shape", "tensorrt_llm::runtime::ITensor::shapeEquals::lhs"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7ITensor11shapeEqualsERK5Shape", "tensorrt_llm::runtime::ITensor::shapeEquals::other"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7ITensor11shapeEqualsERKNSt16initializer_listI10SizeType32EE", "tensorrt_llm::runtime::ITensor::shapeEquals::other"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor11shapeEqualsERK5ShapeRK5Shape", "tensorrt_llm::runtime::ITensor::shapeEquals::rhs"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, 
"_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRK5Shape", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRK5Shape9DimType64", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::slice"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRKNSt16initializer_listI9DimType64EE9DimType64", "tensorrt_llm::runtime::ITensor::slice"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::TConstPtr"], [1, 8, 1, 
"_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::TConstPtr"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::slice::TConstPtr"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::TConstPtr"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::slice::TConstPtr"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::TConstPtr"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::offset"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::offset"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::offset"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::offset"], [1, 4, 1, 
"_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::slice::offsetDims"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::offsetDims"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::slice::offsetDims"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::offsetDims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRK5Shape", "tensorrt_llm::runtime::ITensor::slice::offsetDims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRK5Shape9DimType64", "tensorrt_llm::runtime::ITensor::slice::offsetDims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::slice::offsetDims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRKNSt16initializer_listI9DimType64EE9DimType64", "tensorrt_llm::runtime::ITensor::slice::offsetDims"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::size"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5ShapeNSt6size_tE", 
"tensorrt_llm::runtime::ITensor::slice::size"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRK5Shape9DimType64", "tensorrt_llm::runtime::ITensor::slice::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRKNSt16initializer_listI9DimType64EE9DimType64", "tensorrt_llm::runtime::ITensor::slice::size"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, 
"_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor5sliceE14UniqueConstPtrRR9TConstPtrRKNSt16initializer_listI9DimType64EENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrNSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrNSt6size_tENSt6size_tE", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRK5Shape", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRK5Shape9DimType64", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRKNSt16initializer_listI9DimType64EE", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor5sliceE9SharedPtrRKNSt16initializer_listI9DimType64EE9DimType64", "tensorrt_llm::runtime::ITensor::slice::tensor"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7squeezeE10SizeType32", "tensorrt_llm::runtime::ITensor::squeeze"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7squeezeERK5Shape10SizeType32", "tensorrt_llm::runtime::ITensor::squeeze"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7squeezeE10SizeType32", "tensorrt_llm::runtime::ITensor::squeeze::dim"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7squeezeERK5Shape10SizeType32", "tensorrt_llm::runtime::ITensor::squeeze::dim"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7squeezeERK5Shape10SizeType32", "tensorrt_llm::runtime::ITensor::squeeze::shape"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7stridesERK5Shape", "tensorrt_llm::runtime::ITensor::strides"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor7stridesERK5Shape", "tensorrt_llm::runtime::ITensor::strides::dims"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime7ITensor8toStringERK5Shape", "tensorrt_llm::runtime::ITensor::toString"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor8toStringERK5Shape", "tensorrt_llm::runtime::ITensor::toString::dims"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9unsqueezeE10SizeType32", "tensorrt_llm::runtime::ITensor::unsqueeze"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9unsqueezeERK5Shape10SizeType32", "tensorrt_llm::runtime::ITensor::unsqueeze"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9unsqueezeE10SizeType32", "tensorrt_llm::runtime::ITensor::unsqueeze::dim"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9unsqueezeERK5Shape10SizeType32", "tensorrt_llm::runtime::ITensor::unsqueeze::dim"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor9unsqueezeERK5Shape10SizeType32", "tensorrt_llm::runtime::ITensor::unsqueeze::shape"], [1, 3, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor4viewE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::view"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4viewE9SharedPtr", "tensorrt_llm::runtime::ITensor::view"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4viewEN7IBuffer9SharedPtrERK5Shape", "tensorrt_llm::runtime::ITensor::view"], [1, 8, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor4viewE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::view::TConstPtr"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4viewEN7IBuffer9SharedPtrERK5Shape", "tensorrt_llm::runtime::ITensor::view::buffer"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor4viewE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::view::dims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4viewEN7IBuffer9SharedPtrERK5Shape", 
"tensorrt_llm::runtime::ITensor::view::dims"], [1, 4, 1, "_CPPv4I0_NSt11enable_if_tINSt10is_const_vI18PointerElementTypeI9TConstPtrEEEiEEEN12tensorrt_llm7runtime7ITensor4viewE14UniqueConstPtrRR9TConstPtrRK5Shape", "tensorrt_llm::runtime::ITensor::view::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4viewE9SharedPtr", "tensorrt_llm::runtime::ITensor::view::tensor"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor6volumeERK5Shape", "tensorrt_llm::runtime::ITensor::volume"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor6volumeERK5Shape", "tensorrt_llm::runtime::ITensor::volume::dims"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor17volumeNonNegativeERK5Shape", "tensorrt_llm::runtime::ITensor::volumeNonNegative"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor17volumeNonNegativeERK5Shape", "tensorrt_llm::runtime::ITensor::volumeNonNegative::shape"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5Shape", "tensorrt_llm::runtime::ITensor::wrap"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrRNSt6vectorI1TEERK5Shape", "tensorrt_llm::runtime::ITensor::wrap"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5Shape", "tensorrt_llm::runtime::ITensor::wrap"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5Shape", "tensorrt_llm::runtime::ITensor::wrap::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrRNSt6vectorI1TEERK5Shape", "tensorrt_llm::runtime::ITensor::wrap::T"], [1, 4, 1, 
"_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap::capacity"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap::capacity"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5Shape", "tensorrt_llm::runtime::ITensor::wrap::data"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap::data"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5Shape", "tensorrt_llm::runtime::ITensor::wrap::data"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap::data"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5Shape", "tensorrt_llm::runtime::ITensor::wrap::shape"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrP1TRK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap::shape"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrRNSt6vectorI1TEERK5Shape", "tensorrt_llm::runtime::ITensor::wrap::shape"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5Shape", "tensorrt_llm::runtime::ITensor::wrap::shape"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap::shape"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5Shape", "tensorrt_llm::runtime::ITensor::wrap::type"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7ITensor4wrapEPvN8nvinfer18DataTypeERK5ShapeNSt6size_tE", "tensorrt_llm::runtime::ITensor::wrap::type"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime7ITensor4wrapE9UniquePtrRNSt6vectorI1TEERK5Shape", "tensorrt_llm::runtime::ITensor::wrap::v"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7ITensorD0Ev", 
"tensorrt_llm::runtime::ITensor::~ITensor"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemoryE", "tensorrt_llm::runtime::IpcMemory"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9BufferPtrE", "tensorrt_llm::runtime::IpcMemory::BufferPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory10FLAGS_SIZEE", "tensorrt_llm::runtime::IpcMemory::FLAGS_SIZE"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfigb", "tensorrt_llm::runtime::IpcMemory::IpcMemory"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryERK9IpcMemory", "tensorrt_llm::runtime::IpcMemory::IpcMemory"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryERR9IpcMemory", "tensorrt_llm::runtime::IpcMemory::IpcMemory"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfigb", "tensorrt_llm::runtime::IpcMemory::IpcMemory::bufferSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfigb", "tensorrt_llm::runtime::IpcMemory::IpcMemory::manager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfigb", "tensorrt_llm::runtime::IpcMemory::IpcMemory::openIpc"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9IpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfigb", "tensorrt_llm::runtime::IpcMemory::IpcMemory::worldConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory17allocateIpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfig", "tensorrt_llm::runtime::IpcMemory::allocateIpcMemory"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory17allocateIpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfig", "tensorrt_llm::runtime::IpcMemory::allocateIpcMemory::bufferSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory17allocateIpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfig", "tensorrt_llm::runtime::IpcMemory::allocateIpcMemory::manager"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime9IpcMemory17allocateIpcMemoryENSt6size_tERK13BufferManagerRK11WorldConfig", "tensorrt_llm::runtime::IpcMemory::allocateIpcMemory::worldConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory16destroyIpcMemoryEv", "tensorrt_llm::runtime::IpcMemory::destroyIpcMemory"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9IpcMemory11getCommPtrsEv", "tensorrt_llm::runtime::IpcMemory::getCommPtrs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory7mBufferE", "tensorrt_llm::runtime::IpcMemory::mBuffer"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory9mCommPtrsE", "tensorrt_llm::runtime::IpcMemory::mCommPtrs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory8mOpenIpcE", "tensorrt_llm::runtime::IpcMemory::mOpenIpc"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemory7mTpRankE", "tensorrt_llm::runtime::IpcMemory::mTpRank"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemoryaSERK9IpcMemory", "tensorrt_llm::runtime::IpcMemory::operator="], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemoryaSERR9IpcMemory", "tensorrt_llm::runtime::IpcMemory::operator="], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9IpcMemoryD0Ev", "tensorrt_llm::runtime::IpcMemory::~IpcMemory"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandleE", "tensorrt_llm::runtime::IpcNvlsHandle"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle14ipc_uc_handlesE", "tensorrt_llm::runtime::IpcNvlsHandle::ipc_uc_handles"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle11ipc_uc_ptrsE", "tensorrt_llm::runtime::IpcNvlsHandle::ipc_uc_ptrs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle10ipc_uc_vasE", "tensorrt_llm::runtime::IpcNvlsHandle::ipc_uc_vas"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle9mc_handleE", "tensorrt_llm::runtime::IpcNvlsHandle::mc_handle"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle6mc_ptrE", "tensorrt_llm::runtime::IpcNvlsHandle::mc_ptr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle5mc_vaE", 
"tensorrt_llm::runtime::IpcNvlsHandle::mc_va"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle4sizeE", "tensorrt_llm::runtime::IpcNvlsHandle::size"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle9uc_handleE", "tensorrt_llm::runtime::IpcNvlsHandle::uc_handle"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle6uc_ptrE", "tensorrt_llm::runtime::IpcNvlsHandle::uc_ptr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13IpcNvlsHandle5uc_vaE", "tensorrt_llm::runtime::IpcNvlsHandle::uc_va"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffersE", "tensorrt_llm::runtime::LookaheadDecodingBuffers"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers24LookaheadDecodingBuffersE10SizeType3210SizeType32RK13BufferManager", "tensorrt_llm::runtime::LookaheadDecodingBuffers::LookaheadDecodingBuffers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers24LookaheadDecodingBuffersE10SizeType3210SizeType32RK13BufferManager", "tensorrt_llm::runtime::LookaheadDecodingBuffers::LookaheadDecodingBuffers::bufferManager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers24LookaheadDecodingBuffersE10SizeType3210SizeType32RK13BufferManager", "tensorrt_llm::runtime::LookaheadDecodingBuffers::LookaheadDecodingBuffers::maxNumSequences"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers24LookaheadDecodingBuffersE10SizeType3210SizeType32RK13BufferManager", "tensorrt_llm::runtime::LookaheadDecodingBuffers::LookaheadDecodingBuffers::maxTokensPerStep"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers9TensorPtrE", "tensorrt_llm::runtime::LookaheadDecodingBuffers::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers17generationLengthsE", "tensorrt_llm::runtime::LookaheadDecodingBuffers::generationLengths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers11packedMasksE", "tensorrt_llm::runtime::LookaheadDecodingBuffers::packedMasks"], [1, 5, 
1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers11positionIdsE", "tensorrt_llm::runtime::LookaheadDecodingBuffers::positionIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime24LookaheadDecodingBuffers15positionOffsetsE", "tensorrt_llm::runtime::LookaheadDecodingBuffers::positionOffsets"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModuleE", "tensorrt_llm::runtime::LookaheadModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule15LookaheadModuleE10SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadModule::LookaheadModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule15LookaheadModuleEv", "tensorrt_llm::runtime::LookaheadModule::LookaheadModule"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule15LookaheadModuleE10SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadModule::LookaheadModule::maxDecodingDraftTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule15LookaheadModuleE10SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadModule::LookaheadModule::maxDraftPathLen"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime15LookaheadModule18getExecutionConfigEv", "tensorrt_llm::runtime::LookaheadModule::getExecutionConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule16mExecutionConfigE", "tensorrt_llm::runtime::LookaheadModule::mExecutionConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule18setExecutionConfigERKN8executor23LookaheadDecodingConfigE", "tensorrt_llm::runtime::LookaheadModule::setExecutionConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime15LookaheadModule18setExecutionConfigERKN8executor23LookaheadDecodingConfigE", "tensorrt_llm::runtime::LookaheadModule::setExecutionConfig::config"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffersE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23LookaheadRuntimeBuffersE10SizeType3210SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfigRKN8executor14DecodingConfigERK11TllmRuntime", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::LookaheadRuntimeBuffers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23LookaheadRuntimeBuffersE10SizeType3210SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfigRKN8executor14DecodingConfigERK11TllmRuntime", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::LookaheadRuntimeBuffers::decodingConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23LookaheadRuntimeBuffersE10SizeType3210SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfigRKN8executor14DecodingConfigERK11TllmRuntime", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::LookaheadRuntimeBuffers::manager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23LookaheadRuntimeBuffersE10SizeType3210SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfigRKN8executor14DecodingConfigERK11TllmRuntime", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::LookaheadRuntimeBuffers::maxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23LookaheadRuntimeBuffersE10SizeType3210SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfigRKN8executor14DecodingConfigERK11TllmRuntime", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::LookaheadRuntimeBuffers::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23LookaheadRuntimeBuffersE10SizeType3210SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfigRKN8executor14DecodingConfigERK11TllmRuntime", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::LookaheadRuntimeBuffers::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23LookaheadRuntimeBuffersE10SizeType3210SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfigRKN8executor14DecodingConfigERK11TllmRuntime", 
"tensorrt_llm::runtime::LookaheadRuntimeBuffers::LookaheadRuntimeBuffers::runtime"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23LookaheadRuntimeBuffersE10SizeType3210SizeType32RK13BufferManagerRK11ModelConfigRK11WorldConfigRKN8executor14DecodingConfigERK11TllmRuntime", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::LookaheadRuntimeBuffers::worldConfig"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers9TensorMapE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::TensorMap"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers9TensorPtrE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers18batchSlotsHostCopyE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::batchSlotsHostCopy"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers12cumSumLengthE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::cumSumLength"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers24disableLookaheadDecodingEv", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::disableLookaheadDecoding"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23enableLookaheadDecodingE10SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::enableLookaheadDecoding"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23enableLookaheadDecodingE10SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::enableLookaheadDecoding::maxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23enableLookaheadDecodingE10SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::enableLookaheadDecoding::tokensPerStep"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23generationLengthsDeviceE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::generationLengthsDevice"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers21generationLengthsHostE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::generationLengthsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers25generationLengthsHostCopyE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::generationLengthsHostCopy"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers18insertInputTensorsER9TensorMapR9TensorMapRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::insertInputTensors"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers18insertInputTensorsER9TensorMapR9TensorMapRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::insertInputTensors::inputBuffers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers18insertInputTensorsER9TensorMapR9TensorMapRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::insertInputTensors::outputBuffers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers18insertInputTensorsER9TensorMapR9TensorMapRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::insertInputTensors::worldConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers14packedMaskHostE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::packedMaskHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers18packedMaskHostCopyE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::packedMaskHostCopy"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers17packedMasksDeviceE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::packedMasksDevice"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers17positionIdsDeviceE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::positionIdsDevice"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers15positionIdsHostE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::positionIdsHost"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers19positionIdsHostCopyE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::positionIdsHostCopy"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers21positionOffsetsDeviceE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::positionOffsetsDevice"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers19positionOffsetsHostE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::positionOffsetsHost"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers23positionOffsetsHostCopyE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::positionOffsetsHostCopy"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers7reshapeE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::reshape"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers7reshapeE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::reshape::numCtxSequences"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers7reshapeE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::reshape::numGenSequences"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers7reshapeE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::reshape::tokensPerStep"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::setFromInputs"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::setFromInputs::decoderLookaheadBuffers"], [1, 4, 1, 
"_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::setFromInputs::modelConfig"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::setFromInputs::numCtxSequences"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::setFromInputs::numGenSequences"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::setFromInputs::requestTypes"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::setFromInputs::runtime"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::setFromInputs::seqSlots"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23LookaheadRuntimeBuffers13setFromInputsE10SizeType3210SizeType32RK7ITensorRK7ITensorRK24LookaheadDecodingBuffersRK11TllmRuntimeRK11ModelConfigRK11WorldConfig", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::setFromInputs::worldConfig"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime23LookaheadRuntimeBuffers15useSpecDecodingE", "tensorrt_llm::runtime::LookaheadRuntimeBuffers::useSpecDecoding"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCacheE", "tensorrt_llm::runtime::LoraCache"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9LoraCacheERK26LoraCachePageManagerConfigRK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::LoraCache::LoraCache"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9LoraCacheERK26LoraCachePageManagerConfigRK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::LoraCache::LoraCache::bufferManager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9LoraCacheERK26LoraCachePageManagerConfigRK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::LoraCache::LoraCache::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9LoraCacheERK26LoraCachePageManagerConfigRK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::LoraCache::LoraCache::pageManagerConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9LoraCacheERK26LoraCachePageManagerConfigRK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::LoraCache::LoraCache::worldConfig"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache10TaskIdTypeE", "tensorrt_llm::runtime::LoraCache::TaskIdType"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfigE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig11adapterSizeE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::adapterSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig6inSizeE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::inSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig7layerIdE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::layerId"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig8moduleIdE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::moduleId"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig8numSlotsE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::numSlots"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfigeqERKN9LoraCache21TaskLayerModuleConfigE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::operator=="], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfigeqERKN9LoraCache21TaskLayerModuleConfigE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::operator==::o"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig7outSizeE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::outSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig6pageIdE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::pageId"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig17scalingVecPointerE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::scalingVecPointer"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig7slotIdxE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::slotIdx"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig8toStringEv", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::toString"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig16weightsInPointerE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::weightsInPointer"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21TaskLayerModuleConfig17weightsOutPointerE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfig::weightsOutPointer"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache28TaskLayerModuleConfigListPtrE", "tensorrt_llm::runtime::LoraCache::TaskLayerModuleConfigListPtr"], [1, 2, 1, 
"_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValueE", "tensorrt_llm::runtime::LoraCache::TaskValue"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERKNSt6vectorINSt6size_tEEERK28TaskLayerModuleConfigListPtrNSt4listI10TaskIdTypeE8iteratorEbbbb", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERR9TaskValue", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueEv", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERKNSt6vectorINSt6size_tEEERK28TaskLayerModuleConfigListPtrNSt4listI10TaskIdTypeE8iteratorEbbbb", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue::configs"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERKNSt6vectorINSt6size_tEEERK28TaskLayerModuleConfigListPtrNSt4listI10TaskIdTypeE8iteratorEbbbb", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue::done"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERKNSt6vectorINSt6size_tEEERK28TaskLayerModuleConfigListPtrNSt4listI10TaskIdTypeE8iteratorEbbbb", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue::inProgress"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERKNSt6vectorINSt6size_tEEERK28TaskLayerModuleConfigListPtrNSt4listI10TaskIdTypeE8iteratorEbbbb", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue::it"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERKNSt6vectorINSt6size_tEEERK28TaskLayerModuleConfigListPtrNSt4listI10TaskIdTypeE8iteratorEbbbb", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue::loadInProgress"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERKNSt6vectorINSt6size_tEEERK28TaskLayerModuleConfigListPtrNSt4listI10TaskIdTypeE8iteratorEbbbb", 
"tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue::loaded"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERR9TaskValue", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue::o"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue9TaskValueERKNSt6vectorINSt6size_tEEERK28TaskLayerModuleConfigListPtrNSt4listI10TaskIdTypeE8iteratorEbbbb", "tensorrt_llm::runtime::LoraCache::TaskValue::TaskValue::pageIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue7configsE", "tensorrt_llm::runtime::LoraCache::TaskValue::configs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue4doneE", "tensorrt_llm::runtime::LoraCache::TaskValue::done"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue10inProgressE", "tensorrt_llm::runtime::LoraCache::TaskValue::inProgress"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue2itE", "tensorrt_llm::runtime::LoraCache::TaskValue::it"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue14loadInProgressE", "tensorrt_llm::runtime::LoraCache::TaskValue::loadInProgress"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue6loadedE", "tensorrt_llm::runtime::LoraCache::TaskValue::loaded"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValueaSERR9TaskValue", "tensorrt_llm::runtime::LoraCache::TaskValue::operator="], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValueaSERR9TaskValue", "tensorrt_llm::runtime::LoraCache::TaskValue::operator=::o"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValue7pageIdsE", "tensorrt_llm::runtime::LoraCache::TaskValue::pageIds"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TaskValueD0Ev", "tensorrt_llm::runtime::LoraCache::TaskValue::~TaskValue"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache12TaskValuePtrE", "tensorrt_llm::runtime::LoraCache::TaskValuePtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9TensorPtrE", "tensorrt_llm::runtime::LoraCache::TensorPtr"], [1, 6, 
1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatusE", "tensorrt_llm::runtime::LoraCache::ValueStatus"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatus20kVALUE_STATUS_LOADEDE", "tensorrt_llm::runtime::LoraCache::ValueStatus::kVALUE_STATUS_LOADED"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatus21kVALUE_STATUS_MISSINGE", "tensorrt_llm::runtime::LoraCache::ValueStatus::kVALUE_STATUS_MISSING"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatus24kVALUE_STATUS_PROCESSINGE", "tensorrt_llm::runtime::LoraCache::ValueStatus::kVALUE_STATUS_PROCESSING"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache4bumpE10TaskIdType", "tensorrt_llm::runtime::LoraCache::bump"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache4bumpE10TaskIdType", "tensorrt_llm::runtime::LoraCache::bump::taskId"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache18bumpTaskInProgressE10TaskIdType", "tensorrt_llm::runtime::LoraCache::bumpTaskInProgress"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache18bumpTaskInProgressE10TaskIdType", "tensorrt_llm::runtime::LoraCache::bumpTaskInProgress::taskId"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache19claimPagesWithEvictE10SizeType32", "tensorrt_llm::runtime::LoraCache::claimPagesWithEvict"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache19claimPagesWithEvictE10SizeType32", "tensorrt_llm::runtime::LoraCache::claimPagesWithEvict::numPages"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache8copyTaskE10TaskIdTypeR9LoraCacheb", "tensorrt_llm::runtime::LoraCache::copyTask"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache8copyTaskE10TaskIdTypeR9LoraCacheb", "tensorrt_llm::runtime::LoraCache::copyTask::deviceCache"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache8copyTaskE10TaskIdTypeR9LoraCacheb", "tensorrt_llm::runtime::LoraCache::copyTask::markDone"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache8copyTaskE10TaskIdTypeR9LoraCacheb", "tensorrt_llm::runtime::LoraCache::copyTask::taskId"], 
[1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache16copyTaskMapPagesER9TaskValueRK9TaskValueRKNSt6vectorI6size_tEERK9LoraCache", "tensorrt_llm::runtime::LoraCache::copyTaskMapPages"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache16copyTaskMapPagesER9TaskValueRK9TaskValueRKNSt6vectorI6size_tEERK9LoraCache", "tensorrt_llm::runtime::LoraCache::copyTaskMapPages::sourceTaskValue"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache16copyTaskMapPagesER9TaskValueRK9TaskValueRKNSt6vectorI6size_tEERK9LoraCache", "tensorrt_llm::runtime::LoraCache::copyTaskMapPages::targetCache"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache16copyTaskMapPagesER9TaskValueRK9TaskValueRKNSt6vectorI6size_tEERK9LoraCache", "tensorrt_llm::runtime::LoraCache::copyTaskMapPages::targetPageIds"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache16copyTaskMapPagesER9TaskValueRK9TaskValueRKNSt6vectorI6size_tEERK9LoraCache", "tensorrt_llm::runtime::LoraCache::copyTaskMapPages::targetTaskValue"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCache::copyToPages"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCache::copyToPages::config"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCache::copyToPages::manager"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCache::copyToPages::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCache::copyToPages::moduleIdToModel"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCache::copyToPages::pageIds"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCache::copyToPages::pages"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCache::copyToPages::weights"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11copyToPagesE9TensorPtr9TensorPtrRK11ModelConfigRK11WorldConfigNSt13unordered_mapI10SizeType3210LoraModuleEERK13BufferManagerRKNSt6vectorI9TensorPtrEERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCache::copyToPages::worldConfig"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache17determineNumPagesE10TaskIdType", "tensorrt_llm::runtime::LoraCache::determineNumPages"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache17determineNumPagesE9TensorPtr", 
"tensorrt_llm::runtime::LoraCache::determineNumPages"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache17determineNumPagesE9TensorPtr", "tensorrt_llm::runtime::LoraCache::determineNumPages::config"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache17determineNumPagesE10TaskIdType", "tensorrt_llm::runtime::LoraCache::determineNumPages::taskId"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache4fitsE9TensorPtr", "tensorrt_llm::runtime::LoraCache::fits"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache4fitsE9TensorPtr", "tensorrt_llm::runtime::LoraCache::fits::config"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache3getE10TaskIdType", "tensorrt_llm::runtime::LoraCache::get"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache3getE10TaskIdType", "tensorrt_llm::runtime::LoraCache::get::taskId"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache11getNumPagesEv", "tensorrt_llm::runtime::LoraCache::getNumPages"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache10getPagePtrE6size_t", "tensorrt_llm::runtime::LoraCache::getPagePtr"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache10getPagePtrE6size_t", "tensorrt_llm::runtime::LoraCache::getPagePtr::pageId"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache9getStatusE10TaskIdType", "tensorrt_llm::runtime::LoraCache::getStatus"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache9getStatusE10TaskIdType", "tensorrt_llm::runtime::LoraCache::getStatus::taskId"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache3hasE10TaskIdType", "tensorrt_llm::runtime::LoraCache::has"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache3hasE10TaskIdType", "tensorrt_llm::runtime::LoraCache::has::taskId"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache6isDoneE10TaskIdType", "tensorrt_llm::runtime::LoraCache::isDone"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache6isDoneE10TaskIdType", "tensorrt_llm::runtime::LoraCache::isDone::taskId"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime9LoraCache8isLoadedE10TaskIdType", "tensorrt_llm::runtime::LoraCache::isLoaded"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime9LoraCache8isLoadedE10TaskIdType", "tensorrt_llm::runtime::LoraCache::isLoaded::taskId"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatus20kVALUE_STATUS_LOADEDE", "tensorrt_llm::runtime::LoraCache::kVALUE_STATUS_LOADED"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatus21kVALUE_STATUS_MISSINGE", "tensorrt_llm::runtime::LoraCache::kVALUE_STATUS_MISSING"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11ValueStatus24kVALUE_STATUS_PROCESSINGE", "tensorrt_llm::runtime::LoraCache::kVALUE_STATUS_PROCESSING"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsE10TaskIdType9TensorPtr9TensorPtr", "tensorrt_llm::runtime::LoraCache::loadWeights"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsER9TaskValue9TensorPtr9TensorPtr", "tensorrt_llm::runtime::LoraCache::loadWeights"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsER9TaskValue9TensorPtr9TensorPtr", "tensorrt_llm::runtime::LoraCache::loadWeights::cacheValue"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsE10TaskIdType9TensorPtr9TensorPtr", "tensorrt_llm::runtime::LoraCache::loadWeights::config"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsER9TaskValue9TensorPtr9TensorPtr", "tensorrt_llm::runtime::LoraCache::loadWeights::config"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsE10TaskIdType9TensorPtr9TensorPtr", "tensorrt_llm::runtime::LoraCache::loadWeights::taskId"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsE10TaskIdType9TensorPtr9TensorPtr", "tensorrt_llm::runtime::LoraCache::loadWeights::weights"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11loadWeightsER9TaskValue9TensorPtr9TensorPtr", "tensorrt_llm::runtime::LoraCache::loadWeights::weights"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime9LoraCache14mBufferManagerE", "tensorrt_llm::runtime::LoraCache::mBufferManager"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache9mCacheMapE", "tensorrt_llm::runtime::LoraCache::mCacheMap"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11mCacheMutexE", "tensorrt_llm::runtime::LoraCache::mCacheMutex"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache17mCachePageManagerE", "tensorrt_llm::runtime::LoraCache::mCachePageManager"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache21mDeviceBufferManagersE", "tensorrt_llm::runtime::LoraCache::mDeviceBufferManagers"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache10mDoneTasksE", "tensorrt_llm::runtime::LoraCache::mDoneTasks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache16mInProgressTasksE", "tensorrt_llm::runtime::LoraCache::mInProgressTasks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache12mModelConfigE", "tensorrt_llm::runtime::LoraCache::mModelConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache17mModuleIdToModuleE", "tensorrt_llm::runtime::LoraCache::mModuleIdToModule"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache18mPageManagerConfigE", "tensorrt_llm::runtime::LoraCache::mPageManagerConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11mPagesMutexE", "tensorrt_llm::runtime::LoraCache::mPagesMutex"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache12mWorldConfigE", "tensorrt_llm::runtime::LoraCache::mWorldConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache11markAllDoneEv", "tensorrt_llm::runtime::LoraCache::markAllDone"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache12markTaskDoneE10TaskIdType", "tensorrt_llm::runtime::LoraCache::markTaskDone"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache12markTaskDoneE10TaskIdType", "tensorrt_llm::runtime::LoraCache::markTaskDone::taskId"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache3putE10TaskIdType9TensorPtr9TensorPtrb", "tensorrt_llm::runtime::LoraCache::put"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime9LoraCache3putE10TaskIdType9TensorPtr9TensorPtrb", "tensorrt_llm::runtime::LoraCache::put::config"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache3putE10TaskIdType9TensorPtr9TensorPtrb", "tensorrt_llm::runtime::LoraCache::put::load"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache3putE10TaskIdType9TensorPtr9TensorPtrb", "tensorrt_llm::runtime::LoraCache::put::taskId"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache3putE10TaskIdType9TensorPtr9TensorPtrb", "tensorrt_llm::runtime::LoraCache::put::weights"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache17splitTransposeCpuER7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpu"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache17splitTransposeCpuER7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpu::input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache17splitTransposeCpuER7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpu::output"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache17splitTransposeCpuER7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpu::tpRank"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9LoraCache17splitTransposeCpuER7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpu::tpSize"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime9LoraCache22splitTransposeCpuInnerEvR7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpuInner"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime9LoraCache22splitTransposeCpuInnerEvR7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpuInner::T"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime9LoraCache22splitTransposeCpuInnerEvR7ITensorRK7ITensor10SizeType3210SizeType32", 
"tensorrt_llm::runtime::LoraCache::splitTransposeCpuInner::input"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime9LoraCache22splitTransposeCpuInnerEvR7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpuInner::output"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime9LoraCache22splitTransposeCpuInnerEvR7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpuInner::tpRank"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime9LoraCache22splitTransposeCpuInnerEvR7ITensorRK7ITensor10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCache::splitTransposeCpuInner::tpSize"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime22LoraCacheFullExceptionE", "tensorrt_llm::runtime::LoraCacheFullException"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime22LoraCacheFullException22LoraCacheFullExceptionERKNSt6stringE", "tensorrt_llm::runtime::LoraCacheFullException::LoraCacheFullException"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime22LoraCacheFullException22LoraCacheFullExceptionERKNSt6stringE", "tensorrt_llm::runtime::LoraCacheFullException::LoraCacheFullException::msg"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime22LoraCacheFullExceptionD0Ev", "tensorrt_llm::runtime::LoraCacheFullException::~LoraCacheFullException"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManagerE", "tensorrt_llm::runtime::LoraCachePageManager"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager20LoraCachePageManagerERK26LoraCachePageManagerConfigRK13BufferManager", "tensorrt_llm::runtime::LoraCachePageManager::LoraCachePageManager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager20LoraCachePageManagerERK26LoraCachePageManagerConfigRK13BufferManager", "tensorrt_llm::runtime::LoraCachePageManager::LoraCachePageManager::bufferManager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager20LoraCachePageManagerERK26LoraCachePageManagerConfigRK13BufferManager", 
"tensorrt_llm::runtime::LoraCachePageManager::LoraCachePageManager::config"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager9TensorPtrE", "tensorrt_llm::runtime::LoraCachePageManager::TensorPtr"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime20LoraCachePageManager8blockPtrE10SizeType32", "tensorrt_llm::runtime::LoraCachePageManager::blockPtr"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime20LoraCachePageManager8blockPtrE10SizeType32", "tensorrt_llm::runtime::LoraCachePageManager::blockPtr::blockIdx"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager10claimPagesE10SizeType32", "tensorrt_llm::runtime::LoraCachePageManager::claimPages"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager10claimPagesE10SizeType32", "tensorrt_llm::runtime::LoraCachePageManager::claimPages::numPages"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager10initializeERK13BufferManager", "tensorrt_llm::runtime::LoraCachePageManager::initialize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager10initializeERK13BufferManager", "tensorrt_llm::runtime::LoraCachePageManager::initialize::bufferManager"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager7mConfigE", "tensorrt_llm::runtime::LoraCachePageManager::mConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager12mFreePageIdsE", "tensorrt_llm::runtime::LoraCachePageManager::mFreePageIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager11mIsPageFreeE", "tensorrt_llm::runtime::LoraCachePageManager::mIsPageFree"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager11mPageBlocksE", "tensorrt_llm::runtime::LoraCachePageManager::mPageBlocks"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager14mutablePagePtrENSt6size_tE", "tensorrt_llm::runtime::LoraCachePageManager::mutablePagePtr"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager14mutablePagePtrENSt6size_tE", 
"tensorrt_llm::runtime::LoraCachePageManager::mutablePagePtr::pageIdx"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime20LoraCachePageManager17numAvailablePagesEv", "tensorrt_llm::runtime::LoraCachePageManager::numAvailablePages"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime20LoraCachePageManager7pagePtrENSt6size_tE", "tensorrt_llm::runtime::LoraCachePageManager::pagePtr"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime20LoraCachePageManager7pagePtrENSt6size_tE", "tensorrt_llm::runtime::LoraCachePageManager::pagePtr::pageIdx"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager12releasePagesERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCachePageManager::releasePages"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20LoraCachePageManager12releasePagesERKNSt6vectorINSt6size_tEEE", "tensorrt_llm::runtime::LoraCachePageManager::releasePages::pages"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfigE", "tensorrt_llm::runtime::LoraCachePageManagerConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig26LoraCachePageManagerConfigEN7runtime10MemoryTypeEN8nvinfer18DataTypeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::LoraCachePageManagerConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig26LoraCachePageManagerConfigEN7runtime10MemoryTypeEN8nvinfer18DataTypeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::LoraCachePageManagerConfig::dType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig26LoraCachePageManagerConfigEN7runtime10MemoryTypeEN8nvinfer18DataTypeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::LoraCachePageManagerConfig::maxPagesPerBlock"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig26LoraCachePageManagerConfigEN7runtime10MemoryTypeEN8nvinfer18DataTypeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::LoraCachePageManagerConfig::memType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig26LoraCachePageManagerConfigEN7runtime10MemoryTypeEN8nvinfer18DataTypeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::LoraCachePageManagerConfig::numCopyStreams"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig26LoraCachePageManagerConfigEN7runtime10MemoryTypeEN8nvinfer18DataTypeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::LoraCachePageManagerConfig::pageWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig26LoraCachePageManagerConfigEN7runtime10MemoryTypeEN8nvinfer18DataTypeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::LoraCachePageManagerConfig::slotsPerPage"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig26LoraCachePageManagerConfigEN7runtime10MemoryTypeEN8nvinfer18DataTypeE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::LoraCachePageManagerConfig::totalNumPages"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig11getDataTypeEv", "tensorrt_llm::runtime::LoraCachePageManagerConfig::getDataType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig13getInitToZeroEv", "tensorrt_llm::runtime::LoraCachePageManagerConfig::getInitToZero"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig19getMaxPagesPerBlockEv", "tensorrt_llm::runtime::LoraCachePageManagerConfig::getMaxPagesPerBlock"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig13getMemoryTypeEv", "tensorrt_llm::runtime::LoraCachePageManagerConfig::getMemoryType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig17getNumCopyStreamsEv", "tensorrt_llm::runtime::LoraCachePageManagerConfig::getNumCopyStreams"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig12getPageWidthEv", "tensorrt_llm::runtime::LoraCachePageManagerConfig::getPageWidth"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig15getSlotsPerPageEv", "tensorrt_llm::runtime::LoraCachePageManagerConfig::getSlotsPerPage"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime26LoraCachePageManagerConfig16getTotalNumPagesEv", "tensorrt_llm::runtime::LoraCachePageManagerConfig::getTotalNumPages"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig9mDataTypeE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::mDataType"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig11mInitToZeroE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::mInitToZero"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig17mMaxPagesPerBlockE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::mMaxPagesPerBlock"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig11mMemoryTypeE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::mMemoryType"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig15mNumCopyStreamsE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::mNumCopyStreams"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig10mPageWidthE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::mPageWidth"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig13mSlotsPerPageE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::mSlotsPerPage"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig14mTotalNumPagesE", 
"tensorrt_llm::runtime::LoraCachePageManagerConfig::mTotalNumPages"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig11setDataTypeERKN8nvinfer18DataTypeE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setDataType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig11setDataTypeERKN8nvinfer18DataTypeE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setDataType::dtype"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig13setInitToZeroEb", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setInitToZero"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig13setInitToZeroEb", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setInitToZero::initToZero"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig19setMaxPagesPerBlockERK10SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setMaxPagesPerBlock"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig19setMaxPagesPerBlockERK10SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setMaxPagesPerBlock::maxPagesPerBlock"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig13setMemoryTypeERKN7runtime10MemoryTypeE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setMemoryType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig13setMemoryTypeERKN7runtime10MemoryTypeE", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setMemoryType::memoryType"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig17setNumCopyStreamsE10SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setNumCopyStreams"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig17setNumCopyStreamsE10SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setNumCopyStreams::numCopyStreams"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig12setPageWidthERK10SizeType32", 
"tensorrt_llm::runtime::LoraCachePageManagerConfig::setPageWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig12setPageWidthERK10SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setPageWidth::pageWidth"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig15setSlotsPerPageERK10SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setSlotsPerPage"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig15setSlotsPerPageERK10SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setSlotsPerPage::slotsPerPage"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig15setTotalNumPageERK10SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setTotalNumPage"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime26LoraCachePageManagerConfig15setTotalNumPageERK10SizeType32", "tensorrt_llm::runtime::LoraCachePageManagerConfig::setTotalNumPage::totalNumPages"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime21LoraExpectedExceptionE", "tensorrt_llm::runtime::LoraExpectedException"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime21LoraExpectedException21LoraExpectedExceptionERKNSt6stringE", "tensorrt_llm::runtime::LoraExpectedException::LoraExpectedException"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime21LoraExpectedException21LoraExpectedExceptionERKNSt6stringE", "tensorrt_llm::runtime::LoraExpectedException::LoraExpectedException::msg"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime21LoraExpectedExceptionD0Ev", "tensorrt_llm::runtime::LoraExpectedException::~LoraExpectedException"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModuleE", "tensorrt_llm::runtime::LoraModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10LoraModule", "tensorrt_llm::runtime::LoraModule::LoraModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10ModuleType10SizeType3210SizeType32bb10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::LoraModule"], [1, 
3, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleEv", "tensorrt_llm::runtime::LoraModule::LoraModule"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10ModuleType10SizeType3210SizeType32bb10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::LoraModule::inDim"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10ModuleType10SizeType3210SizeType32bb10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::LoraModule::inDimFirst"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10ModuleType10SizeType3210SizeType32bb10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::LoraModule::inTpSplitDim"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10LoraModule", "tensorrt_llm::runtime::LoraModule::LoraModule::o"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10ModuleType10SizeType3210SizeType32bb10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::LoraModule::outDim"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10ModuleType10SizeType3210SizeType32bb10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::LoraModule::outDimFirst"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10ModuleType10SizeType3210SizeType32bb10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::LoraModule::outTpSplitDim"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10LoraModuleERK10ModuleType10SizeType3210SizeType32bb10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::LoraModule::t"], [1, 6, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleTypeE", "tensorrt_llm::runtime::LoraModule::ModuleType"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType11kATTN_DENSEE", "tensorrt_llm::runtime::LoraModule::ModuleType::kATTN_DENSE"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType7kATTN_KE", "tensorrt_llm::runtime::LoraModule::ModuleType::kATTN_K"], [1, 7, 
1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType7kATTN_QE", "tensorrt_llm::runtime::LoraModule::ModuleType::kATTN_Q"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType9kATTN_QKVE", "tensorrt_llm::runtime::LoraModule::ModuleType::kATTN_QKV"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType7kATTN_VE", "tensorrt_llm::runtime::LoraModule::ModuleType::kATTN_V"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType17kCROSS_ATTN_DENSEE", "tensorrt_llm::runtime::LoraModule::ModuleType::kCROSS_ATTN_DENSE"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType13kCROSS_ATTN_KE", "tensorrt_llm::runtime::LoraModule::ModuleType::kCROSS_ATTN_K"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType13kCROSS_ATTN_QE", "tensorrt_llm::runtime::LoraModule::ModuleType::kCROSS_ATTN_Q"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType15kCROSS_ATTN_QKVE", "tensorrt_llm::runtime::LoraModule::ModuleType::kCROSS_ATTN_QKV"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType13kCROSS_ATTN_VE", "tensorrt_llm::runtime::LoraModule::ModuleType::kCROSS_ATTN_V"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType8kINVALIDE", "tensorrt_llm::runtime::LoraModule::ModuleType::kINVALID"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMLP_4H_TO_HE", "tensorrt_llm::runtime::LoraModule::ModuleType::kMLP_4H_TO_H"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType9kMLP_GATEE", "tensorrt_llm::runtime::LoraModule::ModuleType::kMLP_GATE"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMLP_GATE_UPE", "tensorrt_llm::runtime::LoraModule::ModuleType::kMLP_GATE_UP"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMLP_H_TO_4HE", "tensorrt_llm::runtime::LoraModule::ModuleType::kMLP_H_TO_4H"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType11kMLP_ROUTERE", 
"tensorrt_llm::runtime::LoraModule::ModuleType::kMLP_ROUTER"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMOE_4H_TO_HE", "tensorrt_llm::runtime::LoraModule::ModuleType::kMOE_4H_TO_H"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType9kMOE_GATEE", "tensorrt_llm::runtime::LoraModule::ModuleType::kMOE_GATE"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType12kMOE_H_TO_4HE", "tensorrt_llm::runtime::LoraModule::ModuleType::kMOE_H_TO_4H"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule10ModuleType11kMOE_ROUTERE", "tensorrt_llm::runtime::LoraModule::ModuleType::kMOE_ROUTER"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule9TensorPtrE", "tensorrt_llm::runtime::LoraModule::TensorPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::createLoraModules"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::createLoraModules::attentionHeadSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::createLoraModules::hiddenSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::createLoraModules::loraModuleNames"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", 
"tensorrt_llm::runtime::LoraModule::createLoraModules::mlpHiddenSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::createLoraModules::numAttentionHeads"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::createLoraModules::numExperts"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::createLoraModules::numKvAttentionHeads"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule17createLoraModulesERKNSt6vectorINSt6stringEEE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::createLoraModules::tpSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule18flattenedInOutSizeE10SizeType32b", "tensorrt_llm::runtime::LoraModule::flattenedInOutSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule18flattenedInOutSizeE10SizeType32b", "tensorrt_llm::runtime::LoraModule::flattenedInOutSize::adapterSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule18flattenedInOutSizeE10SizeType32b", "tensorrt_llm::runtime::LoraModule::flattenedInOutSize::isDora"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule5inDimEv", "tensorrt_llm::runtime::LoraModule::inDim"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule10inDimFirstEv", "tensorrt_llm::runtime::LoraModule::inDimFirst"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule6inSizeE10SizeType32", "tensorrt_llm::runtime::LoraModule::inSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule6inSizeE10SizeType32", 
"tensorrt_llm::runtime::LoraModule::inSize::adapterSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule12inTpSplitDimEv", "tensorrt_llm::runtime::LoraModule::inTpSplitDim"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule18localInAdapterSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localInAdapterSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule18localInAdapterSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localInAdapterSize::adapterSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule18localInAdapterSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localInAdapterSize::tpSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule10localInDimE10SizeType32", "tensorrt_llm::runtime::LoraModule::localInDim"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule10localInDimE10SizeType32", "tensorrt_llm::runtime::LoraModule::localInDim::tpSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule14localInOutSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localInOutSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule14localInOutSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localInOutSize::adapterSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule14localInOutSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localInOutSize::tpSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule11localInSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localInSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule11localInSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localInSize::adapterSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule11localInSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localInSize::tpSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule19localOutAdapterSizeE10SizeType3210SizeType32", 
"tensorrt_llm::runtime::LoraModule::localOutAdapterSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule19localOutAdapterSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localOutAdapterSize::adapterSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule19localOutAdapterSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localOutAdapterSize::tpSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule11localOutDimE10SizeType32", "tensorrt_llm::runtime::LoraModule::localOutDim"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule11localOutDimE10SizeType32", "tensorrt_llm::runtime::LoraModule::localOutDim::tpSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule12localOutSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localOutSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule12localOutSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localOutSize::adapterSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule12localOutSizeE10SizeType3210SizeType32", "tensorrt_llm::runtime::LoraModule::localOutSize::tpSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule15localScalesSizeE10SizeType32b", "tensorrt_llm::runtime::LoraModule::localScalesSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule15localScalesSizeE10SizeType32b", "tensorrt_llm::runtime::LoraModule::localScalesSize::isDora"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule15localScalesSizeE10SizeType32b", "tensorrt_llm::runtime::LoraModule::localScalesSize::tpSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule14localTotalSizeE10SizeType3210SizeType32b", "tensorrt_llm::runtime::LoraModule::localTotalSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule14localTotalSizeE10SizeType3210SizeType32b", "tensorrt_llm::runtime::LoraModule::localTotalSize::adapterSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule14localTotalSizeE10SizeType3210SizeType32b", 
"tensorrt_llm::runtime::LoraModule::localTotalSize::isDora"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule14localTotalSizeE10SizeType3210SizeType32b", "tensorrt_llm::runtime::LoraModule::localTotalSize::tpSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule6mInDimE", "tensorrt_llm::runtime::LoraModule::mInDim"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule11mInDimFirstE", "tensorrt_llm::runtime::LoraModule::mInDimFirst"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule13mInTpSplitDimE", "tensorrt_llm::runtime::LoraModule::mInTpSplitDim"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule7mOutDimE", "tensorrt_llm::runtime::LoraModule::mOutDim"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12mOutDimFirstE", "tensorrt_llm::runtime::LoraModule::mOutDimFirst"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule14mOutTpSplitDimE", "tensorrt_llm::runtime::LoraModule::mOutTpSplitDim"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule5mTypeE", "tensorrt_llm::runtime::LoraModule::mType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule4nameEv", "tensorrt_llm::runtime::LoraModule::name"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModuleaSERK10LoraModule", "tensorrt_llm::runtime::LoraModule::operator="], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModuleaSERK10LoraModule", "tensorrt_llm::runtime::LoraModule::operator=::o"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule6outDimEv", "tensorrt_llm::runtime::LoraModule::outDim"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule11outDimFirstEv", "tensorrt_llm::runtime::LoraModule::outDimFirst"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule7outSizeE10SizeType32", "tensorrt_llm::runtime::LoraModule::outSize"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule7outSizeE10SizeType32", "tensorrt_llm::runtime::LoraModule::outSize::adapterSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule13outTpSplitDimEv", 
"tensorrt_llm::runtime::LoraModule::outTpSplitDim"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12toModuleNameE10ModuleType", "tensorrt_llm::runtime::LoraModule::toModuleName"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12toModuleNameE10SizeType32", "tensorrt_llm::runtime::LoraModule::toModuleName"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12toModuleNameE10SizeType32", "tensorrt_llm::runtime::LoraModule::toModuleName::id"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12toModuleNameE10ModuleType", "tensorrt_llm::runtime::LoraModule::toModuleName::t"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12toModuleTypeERKNSt11string_viewE", "tensorrt_llm::runtime::LoraModule::toModuleType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10LoraModule12toModuleTypeERKNSt11string_viewE", "tensorrt_llm::runtime::LoraModule::toModuleType::name"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime10LoraModule5valueEv", "tensorrt_llm::runtime::LoraModule::value"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14LoraTaskIdTypeE", "tensorrt_llm::runtime::LoraTaskIdType"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime17MPI_group_barrierENSt3setIiEE", "tensorrt_llm::runtime::MPI_group_barrier"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime17MPI_group_barrierENSt3setIiEE", "tensorrt_llm::runtime::MPI_group_barrier::ranks"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime12MedusaModuleE", "tensorrt_llm::runtime::MedusaModule"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule13MedusaChoicesE", "tensorrt_llm::runtime::MedusaModule::MedusaChoices"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule12MedusaModuleE10SizeType3210SizeType32", "tensorrt_llm::runtime::MedusaModule::MedusaModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule12MedusaModuleEv", "tensorrt_llm::runtime::MedusaModule::MedusaModule"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule12MedusaModuleE10SizeType3210SizeType32", 
"tensorrt_llm::runtime::MedusaModule::MedusaModule::maxAcceptedTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule12MedusaModuleE10SizeType3210SizeType32", "tensorrt_llm::runtime::MedusaModule::MedusaModule::maxDraftTokens"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule9TensorPtrE", "tensorrt_llm::runtime::MedusaModule::TensorPtr"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime12MedusaModule16getMedusaChoicesEv", "tensorrt_llm::runtime::MedusaModule::getMedusaChoices"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime12MedusaModule21mDefaultMedusaChoicesE", "tensorrt_llm::runtime::MedusaModule::mDefaultMedusaChoices"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCountersE", "tensorrt_llm::runtime::MemoryCounters"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8DiffTypeE", "tensorrt_llm::runtime::MemoryCounters::DiffType"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters14MemoryCountersEv", "tensorrt_llm::runtime::MemoryCounters::MemoryCounters"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters10SizeType32E", "tensorrt_llm::runtime::MemoryCounters::SizeType32"], [1, 3, 1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime14MemoryCounters8allocateEv10SizeType32", "tensorrt_llm::runtime::MemoryCounters::allocate"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8allocateE10MemoryType10SizeType32", "tensorrt_llm::runtime::MemoryCounters::allocate"], [1, 8, 1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime14MemoryCounters8allocateEv10SizeType32", "tensorrt_llm::runtime::MemoryCounters::allocate::T"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8allocateE10MemoryType10SizeType32", "tensorrt_llm::runtime::MemoryCounters::allocate::memoryType"], [1, 4, 1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime14MemoryCounters8allocateEv10SizeType32", "tensorrt_llm::runtime::MemoryCounters::allocate::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8allocateE10MemoryType10SizeType32", 
"tensorrt_llm::runtime::MemoryCounters::allocate::size"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters13bytesToStringE10SizeType32i", "tensorrt_llm::runtime::MemoryCounters::bytesToString"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters13bytesToStringE8DiffTypei", "tensorrt_llm::runtime::MemoryCounters::bytesToString"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters13bytesToStringE10SizeType32i", "tensorrt_llm::runtime::MemoryCounters::bytesToString::bytes"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters13bytesToStringE8DiffTypei", "tensorrt_llm::runtime::MemoryCounters::bytesToString::bytes"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters13bytesToStringE10SizeType32i", "tensorrt_llm::runtime::MemoryCounters::bytesToString::precision"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters13bytesToStringE8DiffTypei", "tensorrt_llm::runtime::MemoryCounters::bytesToString::precision"], [1, 3, 1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime14MemoryCounters10deallocateEv10SizeType32", "tensorrt_llm::runtime::MemoryCounters::deallocate"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters10deallocateE10MemoryType10SizeType32", "tensorrt_llm::runtime::MemoryCounters::deallocate"], [1, 8, 1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime14MemoryCounters10deallocateEv10SizeType32", "tensorrt_llm::runtime::MemoryCounters::deallocate::T"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters10deallocateE10MemoryType10SizeType32", "tensorrt_llm::runtime::MemoryCounters::deallocate::memoryType"], [1, 4, 1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime14MemoryCounters10deallocateEv10SizeType32", "tensorrt_llm::runtime::MemoryCounters::deallocate::size"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters10deallocateE10MemoryType10SizeType32", "tensorrt_llm::runtime::MemoryCounters::deallocate::size"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters6getCpuEv", 
"tensorrt_llm::runtime::MemoryCounters::getCpu"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters10getCpuDiffEv", "tensorrt_llm::runtime::MemoryCounters::getCpuDiff"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters6getGpuEv", "tensorrt_llm::runtime::MemoryCounters::getGpu"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters10getGpuDiffEv", "tensorrt_llm::runtime::MemoryCounters::getGpuDiff"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters11getInstanceEv", "tensorrt_llm::runtime::MemoryCounters::getInstance"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters9getPinnedEv", "tensorrt_llm::runtime::MemoryCounters::getPinned"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters13getPinnedDiffEv", "tensorrt_llm::runtime::MemoryCounters::getPinnedDiff"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters13getPinnedPoolEv", "tensorrt_llm::runtime::MemoryCounters::getPinnedPool"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters17getPinnedPoolDiffEv", "tensorrt_llm::runtime::MemoryCounters::getPinnedPoolDiff"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters6getUVMEv", "tensorrt_llm::runtime::MemoryCounters::getUVM"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters10getUVMDiffEv", "tensorrt_llm::runtime::MemoryCounters::getUVMDiff"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters4mCpuE", "tensorrt_llm::runtime::MemoryCounters::mCpu"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8mCpuDiffE", "tensorrt_llm::runtime::MemoryCounters::mCpuDiff"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters4mGpuE", "tensorrt_llm::runtime::MemoryCounters::mGpu"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8mGpuDiffE", "tensorrt_llm::runtime::MemoryCounters::mGpuDiff"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters7mPinnedE", "tensorrt_llm::runtime::MemoryCounters::mPinned"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters11mPinnedDiffE", 
"tensorrt_llm::runtime::MemoryCounters::mPinnedDiff"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters11mPinnedPoolE", "tensorrt_llm::runtime::MemoryCounters::mPinnedPool"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters15mPinnedPoolDiffE", "tensorrt_llm::runtime::MemoryCounters::mPinnedPoolDiff"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters4mUVME", "tensorrt_llm::runtime::MemoryCounters::mUVM"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14MemoryCounters8mUVMDiffE", "tensorrt_llm::runtime::MemoryCounters::mUVMDiff"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14MemoryCounters8toStringEv", "tensorrt_llm::runtime::MemoryCounters::toString"], [1, 6, 1, "_CPPv4N12tensorrt_llm7runtime10MemoryTypeE", "tensorrt_llm::runtime::MemoryType"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10MemoryType4kCPUE", "tensorrt_llm::runtime::MemoryType::kCPU"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10MemoryType4kGPUE", "tensorrt_llm::runtime::MemoryType::kGPU"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10MemoryType7kPINNEDE", "tensorrt_llm::runtime::MemoryType::kPINNED"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10MemoryType11kPINNEDPOOLE", "tensorrt_llm::runtime::MemoryType::kPINNEDPOOL"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime10MemoryType4kUVME", "tensorrt_llm::runtime::MemoryType::kUVM"], [1, 2, 1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime16MemoryTypeStringE", "tensorrt_llm::runtime::MemoryTypeString"], [1, 8, 1, "_CPPv4I_10MemoryTypeEN12tensorrt_llm7runtime16MemoryTypeStringE", "tensorrt_llm::runtime::MemoryTypeString::T"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kCPUEEE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kCPU&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kCPUEE5valueE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kCPU&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kGPUEEE", 
"tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kGPU&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kGPUEE5valueE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kGPU&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType7kPINNEDEEE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kPINNED&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType7kPINNEDEE5valueE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kPINNED&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType11kPINNEDPOOLEEE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kPINNEDPOOL&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType11kPINNEDPOOLEE5valueE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kPINNEDPOOL&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kUVMEEE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kUVM&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime16MemoryTypeStringIN10MemoryType4kUVMEE5valueE", "tensorrt_llm::runtime::MemoryTypeString&lt;MemoryType::kUVM&gt;::value"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfigE", "tensorrt_llm::runtime::ModelConfig"], [1, 6, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11KVCacheTypeE", "tensorrt_llm::runtime::ModelConfig::KVCacheType"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11KVCacheType11kCONTINUOUSE", "tensorrt_llm::runtime::ModelConfig::KVCacheType::kCONTINUOUS"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11KVCacheType9kDISABLEDE", "tensorrt_llm::runtime::ModelConfig::KVCacheType::kDISABLED"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11KVCacheType6kPAGEDE", "tensorrt_llm::runtime::ModelConfig::KVCacheType::kPAGED"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21KVCacheTypeFromStringENSt6stringE", 
"tensorrt_llm::runtime::ModelConfig::KVCacheTypeFromString"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21KVCacheTypeFromStringENSt6stringE", "tensorrt_llm::runtime::ModelConfig::KVCacheTypeFromString::value"], [1, 6, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerTypeE", "tensorrt_llm::runtime::ModelConfig::LayerType"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerType10kATTENTIONE", "tensorrt_llm::runtime::ModelConfig::LayerType::kATTENTION"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerType7kLINEARE", "tensorrt_llm::runtime::ModelConfig::LayerType::kLINEAR"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerType5kNOOPE", "tensorrt_llm::runtime::ModelConfig::LayerType::kNOOP"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9LayerType10kRECURRENTE", "tensorrt_llm::runtime::ModelConfig::LayerType::kRECURRENT"], [1, 6, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17ManageWeightsTypeE", "tensorrt_llm::runtime::ModelConfig::ManageWeightsType"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17ManageWeightsType9kDisabledE", "tensorrt_llm::runtime::ModelConfig::ManageWeightsType::kDisabled"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17ManageWeightsType8kEnabledE", "tensorrt_llm::runtime::ModelConfig::ManageWeightsType::kEnabled"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11ModelConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::ModelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11ModelConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::ModelConfig::dtype"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11ModelConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::ModelConfig::hiddenSize"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig11ModelConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::ModelConfig::nbAttentionLayers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11ModelConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::ModelConfig::nbHeads"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11ModelConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::ModelConfig::nbLayers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11ModelConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::ModelConfig::nbRnnLayers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11ModelConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType3210SizeType32N8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::ModelConfig::vocabSize"], [1, 6, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariantE", "tensorrt_llm::runtime::ModelConfig::ModelVariant"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant8kChatGlmE", "tensorrt_llm::runtime::ModelConfig::ModelVariant::kChatGlm"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant7kEncDecE", "tensorrt_llm::runtime::ModelConfig::ModelVariant::kEncDec"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant4kGlmE", "tensorrt_llm::runtime::ModelConfig::ModelVariant::kGlm"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant4kGptE", "tensorrt_llm::runtime::ModelConfig::ModelVariant::kGpt"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant6kMambaE", "tensorrt_llm::runtime::ModelConfig::ModelVariant::kMamba"], [1, 7, 1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig12ModelVariant15kRecurrentGemmaE", "tensorrt_llm::runtime::ModelConfig::ModelVariant::kRecurrentGemma"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfigE", "tensorrt_llm::runtime::ModelConfig::RnnConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig10convKernelE", "tensorrt_llm::runtime::ModelConfig::RnnConfig::convKernel"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig14rnnConvDimSizeE", "tensorrt_llm::runtime::ModelConfig::RnnConfig::rnnConvDimSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig11rnnHeadSizeE", "tensorrt_llm::runtime::ModelConfig::RnnConfig::rnnHeadSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig13rnnHiddenSizeE", "tensorrt_llm::runtime::ModelConfig::RnnConfig::rnnHiddenSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9RnnConfig9stateSizeE", "tensorrt_llm::runtime::ModelConfig::RnnConfig::stateSize"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20computeContextLogitsEb", "tensorrt_llm::runtime::ModelConfig::computeContextLogits"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20computeContextLogitsEv", "tensorrt_llm::runtime::ModelConfig::computeContextLogits"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20computeContextLogitsEb", "tensorrt_llm::runtime::ModelConfig::computeContextLogits::computeContextLogits"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig23computeGenerationLogitsEb", "tensorrt_llm::runtime::ModelConfig::computeGenerationLogits"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig23computeGenerationLogitsEv", "tensorrt_llm::runtime::ModelConfig::computeGenerationLogits"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig23computeGenerationLogitsEb", "tensorrt_llm::runtime::ModelConfig::computeGenerationLogits::computeGenerationLogits"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime11ModelConfig16countLocalLayersE9LayerType10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::countLocalLayers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16countLocalLayersE9LayerType10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::countLocalLayers::layerType"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16countLocalLayersE9LayerType10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::countLocalLayers::pipelineParallelism"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16countLocalLayersE9LayerType10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::countLocalLayers::pipelineParallelismRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20countLowerRankLayersE9LayerType10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::countLowerRankLayers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20countLowerRankLayersE9LayerType10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::countLowerRankLayers::layerType"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20countLowerRankLayersE9LayerType10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::countLowerRankLayers::pipelineParallelism"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20countLowerRankLayersE9LayerType10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::countLowerRankLayers::pipelineParallelismRank"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig32disableSeamlessLookaheadDecodingEv", "tensorrt_llm::runtime::ModelConfig::disableSeamlessLookaheadDecoding"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig31enableSeamlessLookaheadDecodingE10SizeType32", "tensorrt_llm::runtime::ModelConfig::enableSeamlessLookaheadDecoding"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig31enableSeamlessLookaheadDecodingE10SizeType32", 
"tensorrt_llm::runtime::ModelConfig::enableSeamlessLookaheadDecoding::maxDraftTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getContextFMHAEv", "tensorrt_llm::runtime::ModelConfig::getContextFMHA"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getDataTypeEv", "tensorrt_llm::runtime::ModelConfig::getDataType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getEncoderHiddenSizeEv", "tensorrt_llm::runtime::ModelConfig::getEncoderHiddenSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getFirstLocalLayerE10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::getFirstLocalLayer"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getFirstLocalLayerE10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::getFirstLocalLayer::pipelineParallelism"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getFirstLocalLayerE10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::getFirstLocalLayer::pipelineParallelismRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21getGemmAllReduceDtypeEv", "tensorrt_llm::runtime::ModelConfig::getGemmAllReduceDtype"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13getHiddenSizeEv", "tensorrt_llm::runtime::ModelConfig::getHiddenSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getKVCacheTypeEv", "tensorrt_llm::runtime::ModelConfig::getKVCacheType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13getKvDataTypeEv", "tensorrt_llm::runtime::ModelConfig::getKvDataType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13getLayerTypesEv", "tensorrt_llm::runtime::ModelConfig::getLayerTypes"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getLogitsDtypeEv", "tensorrt_llm::runtime::ModelConfig::getLogitsDtype"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getLoraModulesEv", "tensorrt_llm::runtime::ModelConfig::getLoraModules"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getManageWeightsTypeEv", "tensorrt_llm::runtime::ModelConfig::getManageWeightsType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getMaxBatchSizeEv", "tensorrt_llm::runtime::ModelConfig::getMaxBatchSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getMaxBeamWidthEv", "tensorrt_llm::runtime::ModelConfig::getMaxBeamWidth"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig25getMaxDecodingDraftTokensEv", "tensorrt_llm::runtime::ModelConfig::getMaxDecodingDraftTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getMaxDecodingTokensEv", "tensorrt_llm::runtime::ModelConfig::getMaxDecodingTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16getMaxEncoderLenEv", "tensorrt_llm::runtime::ModelConfig::getMaxEncoderLen"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getMaxInputLenEv", "tensorrt_llm::runtime::ModelConfig::getMaxInputLen"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getMaxLoraRankEv", "tensorrt_llm::runtime::ModelConfig::getMaxLoraRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getMaxNumTokensEv", "tensorrt_llm::runtime::ModelConfig::getMaxNumTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig24getMaxPositionEmbeddingsEv", "tensorrt_llm::runtime::ModelConfig::getMaxPositionEmbeddings"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig30getMaxPromptEmbeddingTableSizeEv", "tensorrt_llm::runtime::ModelConfig::getMaxPromptEmbeddingTableSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig17getMaxSequenceLenEv", "tensorrt_llm::runtime::ModelConfig::getMaxSequenceLen"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16getMlpHiddenSizeEv", "tensorrt_llm::runtime::ModelConfig::getMlpHiddenSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getModelNameEv", "tensorrt_llm::runtime::ModelConfig::getModelName"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getModelVariantEv", 
"tensorrt_llm::runtime::ModelConfig::getModelVariant"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getNbAttentionLayersE10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbAttentionLayers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getNbAttentionLayersE10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbAttentionLayers::pipelineParallelism"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20getNbAttentionLayersE10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbAttentionLayers::pipelineParallelismRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig10getNbHeadsEv", "tensorrt_llm::runtime::ModelConfig::getNbHeads"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getNbKvHeadsE10SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbKvHeads"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getNbKvHeadsE10SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbKvHeads::layerIdx"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbLayers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbLayers::pipelineParallelism"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig11getNbLayersE10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbLayers::pipelineParallelismRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getNbRnnLayersE10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbRnnLayers"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getNbRnnLayersE10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbRnnLayers::pipelineParallelism"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getNbRnnLayersE10SizeType3210SizeType32", "tensorrt_llm::runtime::ModelConfig::getNbRnnLayers::pipelineParallelismRank"], [1, 
3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21getNumKvHeadsPerLayerEv", "tensorrt_llm::runtime::ModelConfig::getNumKvHeadsPerLayer"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig31getNumKvHeadsPerLayerLocalRangeE10SizeType3210SizeType32b", "tensorrt_llm::runtime::ModelConfig::getNumKvHeadsPerLayerLocalRange"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig31getNumKvHeadsPerLayerLocalRangeE10SizeType3210SizeType32b", "tensorrt_llm::runtime::ModelConfig::getNumKvHeadsPerLayerLocalRange::isCrossAttention"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig31getNumKvHeadsPerLayerLocalRangeE10SizeType3210SizeType32b", "tensorrt_llm::runtime::ModelConfig::getNumKvHeadsPerLayerLocalRange::pipelineParallelism"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig31getNumKvHeadsPerLayerLocalRangeE10SizeType3210SizeType32b", "tensorrt_llm::runtime::ModelConfig::getNumKvHeadsPerLayerLocalRange::pipelineParallelismRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15getNumLanguagesEv", "tensorrt_llm::runtime::ModelConfig::getNumLanguages"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig25getOptProfilesSplitPointsEv", "tensorrt_llm::runtime::ModelConfig::getOptProfilesSplitPoints"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig19getPagedContextFMHAEv", "tensorrt_llm::runtime::ModelConfig::getPagedContextFMHA"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getPpReduceScatterEv", "tensorrt_llm::runtime::ModelConfig::getPpReduceScatter"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getQuantModeEv", "tensorrt_llm::runtime::ModelConfig::getQuantMode"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getRnnConfigEv", "tensorrt_llm::runtime::ModelConfig::getRnnConfig"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21getRotaryEmbeddingDimEv", "tensorrt_llm::runtime::ModelConfig::getRotaryEmbeddingDim"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14getSizePerHeadEv", 
"tensorrt_llm::runtime::ModelConfig::getSizePerHead"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig26getSpeculativeDecodingModeEv", "tensorrt_llm::runtime::ModelConfig::getSpeculativeDecodingMode"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig28getSpeculativeDecodingModuleEv", "tensorrt_llm::runtime::ModelConfig::getSpeculativeDecodingModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig31getSpeculativeDecodingModulePtrEv", "tensorrt_llm::runtime::ModelConfig::getSpeculativeDecodingModulePtr"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig31getSpeculativeDecodingModulePtrEv", "tensorrt_llm::runtime::ModelConfig::getSpeculativeDecodingModulePtr"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getSumLocalKvHeadsE10SizeType3210SizeType32b", "tensorrt_llm::runtime::ModelConfig::getSumLocalKvHeads"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getSumLocalKvHeadsE10SizeType3210SizeType32b", "tensorrt_llm::runtime::ModelConfig::getSumLocalKvHeads::isCrossAttention"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getSumLocalKvHeadsE10SizeType3210SizeType32b", "tensorrt_llm::runtime::ModelConfig::getSumLocalKvHeads::pipelineParallelism"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getSumLocalKvHeadsE10SizeType3210SizeType32b", "tensorrt_llm::runtime::ModelConfig::getSumLocalKvHeads::pipelineParallelismRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig17getTokensPerBlockEv", "tensorrt_llm::runtime::ModelConfig::getTokensPerBlock"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12getVocabSizeEv", "tensorrt_llm::runtime::ModelConfig::getVocabSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getVocabSizePaddedE10SizeType32", "tensorrt_llm::runtime::ModelConfig::getVocabSizePadded"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18getVocabSizePaddedE10SizeType32", "tensorrt_llm::runtime::ModelConfig::getVocabSizePadded::worldSize"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime11ModelConfig12hasRnnConfigEv", "tensorrt_llm::runtime::ModelConfig::hasRnnConfig"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig28hasSpeculativeDecodingModuleEv", "tensorrt_llm::runtime::ModelConfig::hasSpeculativeDecodingModule"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig19isContinuousKVCacheEv", "tensorrt_llm::runtime::ModelConfig::isContinuousKVCache"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig16isKVCacheEnabledEv", "tensorrt_llm::runtime::ModelConfig::isKVCacheEnabled"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig12isMultiModalEv", "tensorrt_llm::runtime::ModelConfig::isMultiModal"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14isPagedKVCacheEv", "tensorrt_llm::runtime::ModelConfig::isPagedKVCache"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig10isRnnBasedEv", "tensorrt_llm::runtime::ModelConfig::isRnnBased"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18isTransformerBasedEv", "tensorrt_llm::runtime::ModelConfig::isTransformerBased"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig9isWhisperEv", "tensorrt_llm::runtime::ModelConfig::isWhisper"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig29kDEFAULT_NUM_TOKENS_PER_BLOCKE", "tensorrt_llm::runtime::ModelConfig::kDEFAULT_NUM_TOKENS_PER_BLOCK"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26kOPT_PROFILES_SPLIT_POINTSE", "tensorrt_llm::runtime::ModelConfig::kOPT_PROFILES_SPLIT_POINTS"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21mComputeContextLogitsE", "tensorrt_llm::runtime::ModelConfig::mComputeContextLogits"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24mComputeGenerationLogitsE", "tensorrt_llm::runtime::ModelConfig::mComputeGenerationLogits"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mContextFMHAE", "tensorrt_llm::runtime::ModelConfig::mContextFMHA"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9mDataTypeE", 
"tensorrt_llm::runtime::ModelConfig::mDataType"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mEncoderHiddenSizeE", "tensorrt_llm::runtime::ModelConfig::mEncoderHiddenSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig19mGemmAllReduceDtypeE", "tensorrt_llm::runtime::ModelConfig::mGemmAllReduceDtype"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11mHiddenSizeE", "tensorrt_llm::runtime::ModelConfig::mHiddenSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mInputPackedE", "tensorrt_llm::runtime::ModelConfig::mInputPacked"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mKVCacheTypeE", "tensorrt_llm::runtime::ModelConfig::mKVCacheType"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11mLayerTypesE", "tensorrt_llm::runtime::ModelConfig::mLayerTypes"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mLogitsDtypeE", "tensorrt_llm::runtime::ModelConfig::mLogitsDtype"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mLoraModulesE", "tensorrt_llm::runtime::ModelConfig::mLoraModules"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mManageWeightsTypeE", "tensorrt_llm::runtime::ModelConfig::mManageWeightsType"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mMaxBatchSizeE", "tensorrt_llm::runtime::ModelConfig::mMaxBatchSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mMaxBeamWidthE", "tensorrt_llm::runtime::ModelConfig::mMaxBeamWidth"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14mMaxEncoderLenE", "tensorrt_llm::runtime::ModelConfig::mMaxEncoderLen"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mMaxInputLenE", "tensorrt_llm::runtime::ModelConfig::mMaxInputLen"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mMaxLoraRankE", "tensorrt_llm::runtime::ModelConfig::mMaxLoraRank"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mMaxNumTokensE", "tensorrt_llm::runtime::ModelConfig::mMaxNumTokens"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig22mMaxPositionEmbeddingsE", "tensorrt_llm::runtime::ModelConfig::mMaxPositionEmbeddings"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig28mMaxPromptEmbeddingTableSizeE", "tensorrt_llm::runtime::ModelConfig::mMaxPromptEmbeddingTableSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15mMaxSequenceLenE", "tensorrt_llm::runtime::ModelConfig::mMaxSequenceLen"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14mMlpHiddenSizeE", "tensorrt_llm::runtime::ModelConfig::mMlpHiddenSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig10mModelNameE", "tensorrt_llm::runtime::ModelConfig::mModelName"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mModelVariantE", "tensorrt_llm::runtime::ModelConfig::mModelVariant"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mNbAttentionLayersE", "tensorrt_llm::runtime::ModelConfig::mNbAttentionLayers"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig8mNbHeadsE", "tensorrt_llm::runtime::ModelConfig::mNbHeads"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9mNbLayersE", "tensorrt_llm::runtime::ModelConfig::mNbLayers"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mNbRnnLayersE", "tensorrt_llm::runtime::ModelConfig::mNbRnnLayers"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig28mNumKvHeadsPerAttentionLayerE", "tensorrt_llm::runtime::ModelConfig::mNumKvHeadsPerAttentionLayer"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig33mNumKvHeadsPerCrossAttentionLayerE", "tensorrt_llm::runtime::ModelConfig::mNumKvHeadsPerCrossAttentionLayer"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13mNumLanguagesE", "tensorrt_llm::runtime::ModelConfig::mNumLanguages"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17mPagedContextFMHAE", "tensorrt_llm::runtime::ModelConfig::mPagedContextFMHA"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11mPagedStateE", "tensorrt_llm::runtime::ModelConfig::mPagedState"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig16mPpReduceScatterE", "tensorrt_llm::runtime::ModelConfig::mPpReduceScatter"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig10mQuantModeE", "tensorrt_llm::runtime::ModelConfig::mQuantMode"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig10mRnnConfigE", "tensorrt_llm::runtime::ModelConfig::mRnnConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig19mRotaryEmbeddingDimE", "tensorrt_llm::runtime::ModelConfig::mRotaryEmbeddingDim"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12mSizePerHeadE", "tensorrt_llm::runtime::ModelConfig::mSizePerHead"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20mSkipCrossAttnBlocksE", "tensorrt_llm::runtime::ModelConfig::mSkipCrossAttnBlocks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24mSpeculativeDecodingModeE", "tensorrt_llm::runtime::ModelConfig::mSpeculativeDecodingMode"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26mSpeculativeDecodingModuleE", "tensorrt_llm::runtime::ModelConfig::mSpeculativeDecodingModule"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15mTokensPerBlockE", "tensorrt_llm::runtime::ModelConfig::mTokensPerBlock"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mUseCrossAttentionE", "tensorrt_llm::runtime::ModelConfig::mUseCrossAttention"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig23mUseGemmAllReducePluginE", "tensorrt_llm::runtime::ModelConfig::mUseGemmAllReducePlugin"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22mUseGptAttentionPluginE", "tensorrt_llm::runtime::ModelConfig::mUseGptAttentionPlugin"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14mUseLoraPluginE", "tensorrt_llm::runtime::ModelConfig::mUseLoraPlugin"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21mUseMambaConv1dPluginE", "tensorrt_llm::runtime::ModelConfig::mUseMambaConv1dPlugin"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig9mUseMropeE", 
"tensorrt_llm::runtime::ModelConfig::mUseMrope"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21mUsePositionEmbeddingE", "tensorrt_llm::runtime::ModelConfig::mUsePositionEmbedding"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18mUseShapeInferenceE", "tensorrt_llm::runtime::ModelConfig::mUseShapeInference"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22mUseTokenTypeEmbeddingE", "tensorrt_llm::runtime::ModelConfig::mUseTokenTypeEmbedding"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig10mVocabSizeE", "tensorrt_llm::runtime::ModelConfig::mVocabSize"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig30resetSpeculativeDecodingModuleEv", "tensorrt_llm::runtime::ModelConfig::resetSpeculativeDecodingModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setContextFMHAEb", "tensorrt_llm::runtime::ModelConfig::setContextFMHA"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setContextFMHAEb", "tensorrt_llm::runtime::ModelConfig::setContextFMHA::contextFMHA"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setEncoderHiddenSizeE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setEncoderHiddenSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setEncoderHiddenSizeE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setEncoderHiddenSize::encoderHiddenSize"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21setGemmAllReduceDtypeEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::setGemmAllReduceDtype"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21setGemmAllReduceDtypeEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::setGemmAllReduceDtype::inputDtype"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setKVCacheTypeE11KVCacheType", "tensorrt_llm::runtime::ModelConfig::setKVCacheType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setKVCacheTypeE11KVCacheType", "tensorrt_llm::runtime::ModelConfig::setKVCacheType::kvCacheType"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig13setLayerTypesERKNSt6vectorI9LayerTypeEE", "tensorrt_llm::runtime::ModelConfig::setLayerTypes"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13setLayerTypesERKNSt6vectorI9LayerTypeEE", "tensorrt_llm::runtime::ModelConfig::setLayerTypes::layerTypes"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setLogitsDtypeEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::setLogitsDtype"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setLogitsDtypeEN8nvinfer18DataTypeE", "tensorrt_llm::runtime::ModelConfig::setLogitsDtype::inputDtype"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setLoraModulesERKNSt6vectorI10LoraModuleEE", "tensorrt_llm::runtime::ModelConfig::setLoraModules"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setLoraModulesERKNSt6vectorI10LoraModuleEE", "tensorrt_llm::runtime::ModelConfig::setLoraModules::loraModules"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setManageWeightsTypeEK17ManageWeightsType", "tensorrt_llm::runtime::ModelConfig::setManageWeightsType"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setManageWeightsTypeEK17ManageWeightsType", "tensorrt_llm::runtime::ModelConfig::setManageWeightsType::manageWeightType"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setMaxBatchSizeE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxBatchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setMaxBatchSizeE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxBatchSize::maxBatchSize"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setMaxBeamWidthE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setMaxBeamWidthE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxBeamWidth::maxBeamWidth"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig16setMaxEncoderLenE10SizeType32", 
"tensorrt_llm::runtime::ModelConfig::setMaxEncoderLen"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig16setMaxEncoderLenE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxEncoderLen::maxEncoderLen"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setMaxInputLenE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxInputLen"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setMaxInputLenE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxInputLen::maxInputLen"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setMaxLoraRankE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxLoraRank"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setMaxLoraRankE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxLoraRank::maxLoraRank"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setMaxNumTokensENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::ModelConfig::setMaxNumTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setMaxNumTokensENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::ModelConfig::setMaxNumTokens::maxNumTokens"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24setMaxPositionEmbeddingsE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxPositionEmbeddings"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24setMaxPositionEmbeddingsE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxPositionEmbeddings::maxPositionEmbeddings"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig30setMaxPromptEmbeddingTableSizeE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxPromptEmbeddingTableSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig30setMaxPromptEmbeddingTableSizeE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxPromptEmbeddingTableSize::maxPromptEmbeddingTableSize"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17setMaxSequenceLenE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxSequenceLen"], [1, 4, 
1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17setMaxSequenceLenE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMaxSequenceLen::maxSequenceLen"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig16setMlpHiddenSizeE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMlpHiddenSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig16setMlpHiddenSizeE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setMlpHiddenSize::mlpHiddenSize"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setModelNameERKNSt6stringE", "tensorrt_llm::runtime::ModelConfig::setModelName"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setModelNameERKNSt6stringE", "tensorrt_llm::runtime::ModelConfig::setModelName::modelName"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setModelVariantE12ModelVariant", "tensorrt_llm::runtime::ModelConfig::setModelVariant"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setModelVariantE12ModelVariant", "tensorrt_llm::runtime::ModelConfig::setModelVariant::modelVariant"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17setNbCrossKvHeadsE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setNbCrossKvHeads"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17setNbCrossKvHeadsE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setNbCrossKvHeads::nbKvHeads"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setNbKvHeadsE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setNbKvHeads"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setNbKvHeadsE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setNbKvHeads::nbKvHeads"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26setNumKvHeadsPerCrossLayerERKNSt6vectorI10SizeType32EE", "tensorrt_llm::runtime::ModelConfig::setNumKvHeadsPerCrossLayer"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26setNumKvHeadsPerCrossLayerERKNSt6vectorI10SizeType32EE", 
"tensorrt_llm::runtime::ModelConfig::setNumKvHeadsPerCrossLayer::headsPerLayer"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21setNumKvHeadsPerLayerERKNSt6vectorI10SizeType32EE", "tensorrt_llm::runtime::ModelConfig::setNumKvHeadsPerLayer"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21setNumKvHeadsPerLayerERKNSt6vectorI10SizeType32EE", "tensorrt_llm::runtime::ModelConfig::setNumKvHeadsPerLayer::headsPerLayer"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setNumLanguagesENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::ModelConfig::setNumLanguages"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig15setNumLanguagesENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::ModelConfig::setNumLanguages::numLanguages"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig19setPagedContextFMHAEb", "tensorrt_llm::runtime::ModelConfig::setPagedContextFMHA"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig19setPagedContextFMHAEb", "tensorrt_llm::runtime::ModelConfig::setPagedContextFMHA::pagedContextFMHA"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18setPpReduceScatterEb", "tensorrt_llm::runtime::ModelConfig::setPpReduceScatter"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig18setPpReduceScatterEb", "tensorrt_llm::runtime::ModelConfig::setPpReduceScatter::ppReduceScatter"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setQuantModeEN6common9QuantModeE", "tensorrt_llm::runtime::ModelConfig::setQuantMode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setQuantModeEN6common9QuantModeE", "tensorrt_llm::runtime::ModelConfig::setQuantMode::QuantMode"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setRnnConfigERK9RnnConfig", "tensorrt_llm::runtime::ModelConfig::setRnnConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig12setRnnConfigERK9RnnConfig", "tensorrt_llm::runtime::ModelConfig::setRnnConfig::rnnConfig"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime11ModelConfig21setRotaryEmbeddingDimE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setRotaryEmbeddingDim"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21setRotaryEmbeddingDimE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setRotaryEmbeddingDim::rotaryEmbeddingDim"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setSizePerHeadE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setSizePerHead"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14setSizePerHeadE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setSizePerHead::sizePerHead"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22setSkipCrossAttnBlocksEb", "tensorrt_llm::runtime::ModelConfig::setSkipCrossAttnBlocks"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22setSkipCrossAttnBlocksEb", "tensorrt_llm::runtime::ModelConfig::setSkipCrossAttnBlocks::skipCrossAttnBlocks"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26setSpeculativeDecodingModeE23SpeculativeDecodingMode", "tensorrt_llm::runtime::ModelConfig::setSpeculativeDecodingMode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig26setSpeculativeDecodingModeE23SpeculativeDecodingMode", "tensorrt_llm::runtime::ModelConfig::setSpeculativeDecodingMode::mode"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig28setSpeculativeDecodingModuleERKNSt10shared_ptrI25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::ModelConfig::setSpeculativeDecodingModule"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig28setSpeculativeDecodingModuleERKNSt10shared_ptrI25SpeculativeDecodingModuleEE", "tensorrt_llm::runtime::ModelConfig::setSpeculativeDecodingModule::speculativeDecodingModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17setTokensPerBlockE10SizeType32", "tensorrt_llm::runtime::ModelConfig::setTokensPerBlock"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig17setTokensPerBlockE10SizeType32", 
"tensorrt_llm::runtime::ModelConfig::setTokensPerBlock::TokensPerBlock"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setUseCrossAttentionEb", "tensorrt_llm::runtime::ModelConfig::setUseCrossAttention"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setUseCrossAttentionEb", "tensorrt_llm::runtime::ModelConfig::setUseCrossAttention::useCrossAttention"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11setUseMropeEb", "tensorrt_llm::runtime::ModelConfig::setUseMrope"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig11setUseMropeEb", "tensorrt_llm::runtime::ModelConfig::setUseMrope::useMrope"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig23setUsePositionEmbeddingEb", "tensorrt_llm::runtime::ModelConfig::setUsePositionEmbedding"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig23setUsePositionEmbeddingEb", "tensorrt_llm::runtime::ModelConfig::setUsePositionEmbedding::usePositionEmbedding"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setUseShapeInferenceEb", "tensorrt_llm::runtime::ModelConfig::setUseShapeInference"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20setUseShapeInferenceEb", "tensorrt_llm::runtime::ModelConfig::setUseShapeInference::useShapeInference"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24setUseTokenTypeEmbeddingEb", "tensorrt_llm::runtime::ModelConfig::setUseTokenTypeEmbedding"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig24setUseTokenTypeEmbeddingEb", "tensorrt_llm::runtime::ModelConfig::setUseTokenTypeEmbedding::useTokenTypeEmbedding"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig19skipCrossAttnBlocksEv", "tensorrt_llm::runtime::ModelConfig::skipCrossAttnBlocks"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig24supportsInflightBatchingEv", "tensorrt_llm::runtime::ModelConfig::supportsInflightBatching"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig17useCrossAttentionEv", "tensorrt_llm::runtime::ModelConfig::useCrossAttention"], [1, 
3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22useGemmAllReducePluginEb", "tensorrt_llm::runtime::ModelConfig::useGemmAllReducePlugin"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig22useGemmAllReducePluginEv", "tensorrt_llm::runtime::ModelConfig::useGemmAllReducePlugin"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig22useGemmAllReducePluginEb", "tensorrt_llm::runtime::ModelConfig::useGemmAllReducePlugin::useGemmAllReducePlugin"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21useGptAttentionPluginEb", "tensorrt_llm::runtime::ModelConfig::useGptAttentionPlugin"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21useGptAttentionPluginEv", "tensorrt_llm::runtime::ModelConfig::useGptAttentionPlugin"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig21useGptAttentionPluginEb", "tensorrt_llm::runtime::ModelConfig::useGptAttentionPlugin::useGptAttentionPlugin"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig18useLanguageAdapterEv", "tensorrt_llm::runtime::ModelConfig::useLanguageAdapter"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13useLoraPluginEb", "tensorrt_llm::runtime::ModelConfig::useLoraPlugin"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13useLoraPluginEv", "tensorrt_llm::runtime::ModelConfig::useLoraPlugin"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13useLoraPluginEb", "tensorrt_llm::runtime::ModelConfig::useLoraPlugin::useLoraPlugin"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20useMambaConv1dPluginEb", "tensorrt_llm::runtime::ModelConfig::useMambaConv1dPlugin"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20useMambaConv1dPluginEv", "tensorrt_llm::runtime::ModelConfig::useMambaConv1dPlugin"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig20useMambaConv1dPluginEb", "tensorrt_llm::runtime::ModelConfig::useMambaConv1dPlugin::useMambaConv1dPlugin"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig8useMropeEv", 
"tensorrt_llm::runtime::ModelConfig::useMrope"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14usePackedInputEb", "tensorrt_llm::runtime::ModelConfig::usePackedInput"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig14usePackedInputEv", "tensorrt_llm::runtime::ModelConfig::usePackedInput"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig14usePackedInputEb", "tensorrt_llm::runtime::ModelConfig::usePackedInput::inputPacked"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13usePagedStateEb", "tensorrt_llm::runtime::ModelConfig::usePagedState"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig13usePagedStateEv", "tensorrt_llm::runtime::ModelConfig::usePagedState"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ModelConfig13usePagedStateEb", "tensorrt_llm::runtime::ModelConfig::usePagedState::pagedState"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig20usePositionEmbeddingEv", "tensorrt_llm::runtime::ModelConfig::usePositionEmbedding"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig15usePromptTuningEv", "tensorrt_llm::runtime::ModelConfig::usePromptTuning"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig17useShapeInferenceEv", "tensorrt_llm::runtime::ModelConfig::useShapeInference"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11ModelConfig21useTokenTypeEmbeddingEv", "tensorrt_llm::runtime::ModelConfig::useTokenTypeEmbedding"], [1, 1, 1, "_CPPv4I0EN12tensorrt_llm7runtime18PointerElementTypeE", "tensorrt_llm::runtime::PointerElementType"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime18PointerElementTypeE", "tensorrt_llm::runtime::PointerElementType::T"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParamsE", "tensorrt_llm::runtime::PromptTuningParams"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams18PromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", "tensorrt_llm::runtime::PromptTuningParams::PromptTuningParams"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime18PromptTuningParams18PromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", "tensorrt_llm::runtime::PromptTuningParams::PromptTuningParams::embeddingTable"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams18PromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", "tensorrt_llm::runtime::PromptTuningParams::PromptTuningParams::tasks"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams18PromptTuningParamsE9TensorPtr9TensorPtr9TensorPtr", "tensorrt_llm::runtime::PromptTuningParams::PromptTuningParams::vocabSize"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams10SizeType32E", "tensorrt_llm::runtime::PromptTuningParams::SizeType32"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams9TensorPtrE", "tensorrt_llm::runtime::PromptTuningParams::TensorPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams15fillTasksTensorE9TensorPtr10SizeType3210SizeType32RKNSt6vectorI10SizeType32EERKNSt6vectorI10SizeType32EERK13BufferManagerb", "tensorrt_llm::runtime::PromptTuningParams::fillTasksTensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams15fillTasksTensorE9TensorPtr10SizeType3210SizeType32RKNSt6vectorI10SizeType32EERKNSt6vectorI10SizeType32EERK13BufferManagerb", "tensorrt_llm::runtime::PromptTuningParams::fillTasksTensor::batchSize"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams15fillTasksTensorE9TensorPtr10SizeType3210SizeType32RKNSt6vectorI10SizeType32EERKNSt6vectorI10SizeType32EERK13BufferManagerb", "tensorrt_llm::runtime::PromptTuningParams::fillTasksTensor::manager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams15fillTasksTensorE9TensorPtr10SizeType3210SizeType32RKNSt6vectorI10SizeType32EERKNSt6vectorI10SizeType32EERK13BufferManagerb", "tensorrt_llm::runtime::PromptTuningParams::fillTasksTensor::numContextRequests"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime18PromptTuningParams15fillTasksTensorE9TensorPtr10SizeType3210SizeType32RKNSt6vectorI10SizeType32EERKNSt6vectorI10SizeType32EERK13BufferManagerb", "tensorrt_llm::runtime::PromptTuningParams::fillTasksTensor::packedInput"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams15fillTasksTensorE9TensorPtr10SizeType3210SizeType32RKNSt6vectorI10SizeType32EERKNSt6vectorI10SizeType32EERK13BufferManagerb", "tensorrt_llm::runtime::PromptTuningParams::fillTasksTensor::reqBeamWidths"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams15fillTasksTensorE9TensorPtr10SizeType3210SizeType32RKNSt6vectorI10SizeType32EERKNSt6vectorI10SizeType32EERK13BufferManagerb", "tensorrt_llm::runtime::PromptTuningParams::fillTasksTensor::reqPromptLengths"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime18PromptTuningParams15fillTasksTensorE9TensorPtr10SizeType3210SizeType32RKNSt6vectorI10SizeType32EERKNSt6vectorI10SizeType32EERK13BufferManagerb", "tensorrt_llm::runtime::PromptTuningParams::fillTasksTensor::tasksHost"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngineE", "tensorrt_llm::runtime::RawEngine"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4Type15AddressWithSizeE", "tensorrt_llm::runtime::RawEngine::AddressWithSize"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4Type8FilePathE", "tensorrt_llm::runtime::RawEngine::FilePath"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4Type10HostMemoryE", "tensorrt_llm::runtime::RawEngine::HostMemory"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineENSt10filesystem4pathE", "tensorrt_llm::runtime::RawEngine::RawEngine"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineEPKN8nvinfer111IHostMemoryE", "tensorrt_llm::runtime::RawEngine::RawEngine"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineEPKvNSt6size_tE", "tensorrt_llm::runtime::RawEngine::RawEngine"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineEPKvNSt6size_tE", 
"tensorrt_llm::runtime::RawEngine::RawEngine::engineAddr"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineEPKN8nvinfer111IHostMemoryE", "tensorrt_llm::runtime::RawEngine::RawEngine::engineBuffer"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineENSt10filesystem4pathE", "tensorrt_llm::runtime::RawEngine::RawEngine::enginePath"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine9RawEngineEPKvNSt6size_tE", "tensorrt_llm::runtime::RawEngine::RawEngine::engineSize"], [1, 6, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4TypeE", "tensorrt_llm::runtime::RawEngine::Type"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4Type15AddressWithSizeE", "tensorrt_llm::runtime::RawEngine::Type::AddressWithSize"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4Type8FilePathE", "tensorrt_llm::runtime::RawEngine::Type::FilePath"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine4Type10HostMemoryE", "tensorrt_llm::runtime::RawEngine::Type::HostMemory"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine10getAddressEv", "tensorrt_llm::runtime::RawEngine::getAddress"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine13getHostMemoryEv", "tensorrt_llm::runtime::RawEngine::getHostMemory"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine23getManagedWeightsMapOptEv", "tensorrt_llm::runtime::RawEngine::getManagedWeightsMapOpt"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine7getPathEv", "tensorrt_llm::runtime::RawEngine::getPath"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine10getPathOptEv", "tensorrt_llm::runtime::RawEngine::getPathOpt"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine7getSizeEv", "tensorrt_llm::runtime::RawEngine::getSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime9RawEngine7getTypeEv", "tensorrt_llm::runtime::RawEngine::getType"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine11mEngineAddrE", "tensorrt_llm::runtime::RawEngine::mEngineAddr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine13mEngineBufferE", 
"tensorrt_llm::runtime::RawEngine::mEngineBuffer"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine11mEnginePathE", "tensorrt_llm::runtime::RawEngine::mEnginePath"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine11mEngineSizeE", "tensorrt_llm::runtime::RawEngine::mEngineSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine18mManagedWeightsMapE", "tensorrt_llm::runtime::RawEngine::mManagedWeightsMap"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine5mTypeE", "tensorrt_llm::runtime::RawEngine::mType"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine20setManagedWeightsMapENSt3mapINSt6stringEN12tensorrt_llm8executor6TensorEEE", "tensorrt_llm::runtime::RawEngine::setManagedWeightsMap"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine20setManagedWeightsMapENSt3mapINSt6stringEN12tensorrt_llm8executor6TensorEEE", "tensorrt_llm::runtime::RawEngine::setManagedWeightsMap::managedWeightsMap"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine7setPathENSt10filesystem4pathE", "tensorrt_llm::runtime::RawEngine::setPath"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9RawEngine7setPathENSt10filesystem4pathE", "tensorrt_llm::runtime::RawEngine::setPath::enginePath"], [1, 6, 1, "_CPPv4N12tensorrt_llm7runtime11RequestTypeE", "tensorrt_llm::runtime::RequestType"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11RequestType8kCONTEXTE", "tensorrt_llm::runtime::RequestType::kCONTEXT"], [1, 7, 1, "_CPPv4N12tensorrt_llm7runtime11RequestType11kGENERATIONE", "tensorrt_llm::runtime::RequestType::kGENERATION"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaultsE", "tensorrt_llm::runtime::RuntimeDefaults"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults15RuntimeDefaultsENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::RuntimeDefaults::RuntimeDefaults"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults15RuntimeDefaultsEv", "tensorrt_llm::runtime::RuntimeDefaults::RuntimeDefaults"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults15RuntimeDefaultsENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::RuntimeDefaults::RuntimeDefaults::maxAttentionWindowVec"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults15RuntimeDefaultsENSt8optionalINSt6vectorI10SizeType32EEEENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::RuntimeDefaults::RuntimeDefaults::sinkTokenLength"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults21maxAttentionWindowVecE", "tensorrt_llm::runtime::RuntimeDefaults::maxAttentionWindowVec"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime15RuntimeDefaults15sinkTokenLengthE", "tensorrt_llm::runtime::RuntimeDefaults::sinkTokenLength"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfigE", "tensorrt_llm::runtime::SamplingConfig"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig9FloatTypeE", "tensorrt_llm::runtime::SamplingConfig::FloatType"], [1, 1, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig6OptVecE", "tensorrt_llm::runtime::SamplingConfig::OptVec"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig6OptVecE", "tensorrt_llm::runtime::SamplingConfig::OptVec::T"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigE10SizeType32", "tensorrt_llm::runtime::SamplingConfig::SamplingConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigERKN8executor14SamplingConfigERKNSt8optionalIN8executor25ExternalDraftTokensConfigEEE", "tensorrt_llm::runtime::SamplingConfig::SamplingConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigERKNSt6vectorI14SamplingConfigEE", "tensorrt_llm::runtime::SamplingConfig::SamplingConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigE10SizeType32", "tensorrt_llm::runtime::SamplingConfig::SamplingConfig::beamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigERKNSt6vectorI14SamplingConfigEE", 
"tensorrt_llm::runtime::SamplingConfig::SamplingConfig::configs"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigERKN8executor14SamplingConfigERKNSt8optionalIN8executor25ExternalDraftTokensConfigEEE", "tensorrt_llm::runtime::SamplingConfig::SamplingConfig::externalDraftTokensConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14SamplingConfigERKN8executor14SamplingConfigERKNSt8optionalIN8executor25ExternalDraftTokensConfigEEE", "tensorrt_llm::runtime::SamplingConfig::SamplingConfig::samplingConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig23beamSearchDiversityRateE", "tensorrt_llm::runtime::SamplingConfig::beamSearchDiversityRate"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig9beamWidthE", "tensorrt_llm::runtime::SamplingConfig::beamWidth"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14beamWidthArrayE", "tensorrt_llm::runtime::SamplingConfig::beamWidthArray"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig11cumLogProbsE", "tensorrt_llm::runtime::SamplingConfig::cumLogProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig24draftAcceptanceThresholdE", "tensorrt_llm::runtime::SamplingConfig::draftAcceptanceThreshold"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig13earlyStoppingE", "tensorrt_llm::runtime::SamplingConfig::earlyStopping"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig16frequencyPenaltyE", "tensorrt_llm::runtime::SamplingConfig::frequencyPenalty"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig10fuseValuesE6OptVecI1TERKNSt6vectorI14SamplingConfigEENSt8functionIF6OptVecI1TE6size_tEEE1T", "tensorrt_llm::runtime::SamplingConfig::fuseValues"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig10fuseValuesE6OptVecI1TERKNSt6vectorI14SamplingConfigEENSt8functionIF6OptVecI1TE6size_tEEE1T", "tensorrt_llm::runtime::SamplingConfig::fuseValues::T"], [1, 4, 1, 
"_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig10fuseValuesE6OptVecI1TERKNSt6vectorI14SamplingConfigEENSt8functionIF6OptVecI1TE6size_tEEE1T", "tensorrt_llm::runtime::SamplingConfig::fuseValues::accessor"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig10fuseValuesE6OptVecI1TERKNSt6vectorI14SamplingConfigEENSt8functionIF6OptVecI1TE6size_tEEE1T", "tensorrt_llm::runtime::SamplingConfig::fuseValues::configs"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig10fuseValuesE6OptVecI1TERKNSt6vectorI14SamplingConfigEENSt8functionIF6OptVecI1TE6size_tEEE1T", "tensorrt_llm::runtime::SamplingConfig::fuseValues::defaultValue"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14SamplingConfig15getMaxBeamWidthEv", "tensorrt_llm::runtime::SamplingConfig::getMaxBeamWidth"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14SamplingConfig17getNumReturnBeamsEv", "tensorrt_llm::runtime::SamplingConfig::getNumReturnBeams"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig13lengthPenaltyE", "tensorrt_llm::runtime::SamplingConfig::lengthPenalty"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig9minLengthE", "tensorrt_llm::runtime::SamplingConfig::minLength"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig4minPE", "tensorrt_llm::runtime::SamplingConfig::minP"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig17noRepeatNgramSizeE", "tensorrt_llm::runtime::SamplingConfig::noRepeatNgramSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig17normalizeLogProbsE", "tensorrt_llm::runtime::SamplingConfig::normalizeLogProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig18numReturnSequencesE", "tensorrt_llm::runtime::SamplingConfig::numReturnSequences"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime14SamplingConfigeqERK14SamplingConfig", "tensorrt_llm::runtime::SamplingConfig::operator=="], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime14SamplingConfigeqERK14SamplingConfig", "tensorrt_llm::runtime::SamplingConfig::operator==::other"], [1, 
5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig19originalTemperatureE", "tensorrt_llm::runtime::SamplingConfig::originalTemperature"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig14outputLogProbsE", "tensorrt_llm::runtime::SamplingConfig::outputLogProbs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig15presencePenaltyE", "tensorrt_llm::runtime::SamplingConfig::presencePenalty"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig10randomSeedE", "tensorrt_llm::runtime::SamplingConfig::randomSeed"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig17repetitionPenaltyE", "tensorrt_llm::runtime::SamplingConfig::repetitionPenalty"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig11temperatureE", "tensorrt_llm::runtime::SamplingConfig::temperature"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig4topKE", "tensorrt_llm::runtime::SamplingConfig::topK"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig15topKMedusaHeadsE", "tensorrt_llm::runtime::SamplingConfig::topKMedusaHeads"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig4topPE", "tensorrt_llm::runtime::SamplingConfig::topP"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig9topPDecayE", "tensorrt_llm::runtime::SamplingConfig::topPDecay"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig7topPMinE", "tensorrt_llm::runtime::SamplingConfig::topPMin"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig12topPResetIdsE", "tensorrt_llm::runtime::SamplingConfig::topPResetIds"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig16useDefaultValuesEbRK6OptVecI1TE1T", "tensorrt_llm::runtime::SamplingConfig::useDefaultValues"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig16useDefaultValuesEbRK6OptVecI1TE1T", "tensorrt_llm::runtime::SamplingConfig::useDefaultValues::T"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig16useDefaultValuesEbRK6OptVecI1TE1T", 
"tensorrt_llm::runtime::SamplingConfig::useDefaultValues::defaultValue"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig16useDefaultValuesEbRK6OptVecI1TE1T", "tensorrt_llm::runtime::SamplingConfig::useDefaultValues::vec"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime14SamplingConfig8validateEv", "tensorrt_llm::runtime::SamplingConfig::validate"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig11validateVecEbNSt6stringERK6OptVecI1TE1TNSt8optionalI1TEE", "tensorrt_llm::runtime::SamplingConfig::validateVec"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig11validateVecEbNSt6stringERK6OptVecI1TE1TNSt8optionalI1TEE", "tensorrt_llm::runtime::SamplingConfig::validateVec::T"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig11validateVecEbNSt6stringERK6OptVecI1TE1TNSt8optionalI1TEE", "tensorrt_llm::runtime::SamplingConfig::validateVec::max"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig11validateVecEbNSt6stringERK6OptVecI1TE1TNSt8optionalI1TEE", "tensorrt_llm::runtime::SamplingConfig::validateVec::min"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig11validateVecEbNSt6stringERK6OptVecI1TE1TNSt8optionalI1TEE", "tensorrt_llm::runtime::SamplingConfig::validateVec::name"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime14SamplingConfig11validateVecEbNSt6stringERK6OptVecI1TE1TNSt8optionalI1TEE", "tensorrt_llm::runtime::SamplingConfig::validateVec::vec"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime10SizeType32E", "tensorrt_llm::runtime::SizeType32"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime10SizeType64E", "tensorrt_llm::runtime::SizeType64"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingModeE", "tensorrt_llm::runtime::SpeculativeDecodingMode"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode19DraftTokensExternalEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::DraftTokensExternal"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode5EagleEv", 
"tensorrt_llm::runtime::SpeculativeDecodingMode::Eagle"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode19ExplicitDraftTokensEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::ExplicitDraftTokens"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode17LookaheadDecodingEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::LookaheadDecoding"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode6MedusaEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::Medusa"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode4NoneEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::None"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode23SpeculativeDecodingModeE14UnderlyingType", "tensorrt_llm::runtime::SpeculativeDecodingMode::SpeculativeDecodingMode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode23SpeculativeDecodingModeE14UnderlyingType", "tensorrt_llm::runtime::SpeculativeDecodingMode::SpeculativeDecodingMode::state"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode14UnderlyingTypeE", "tensorrt_llm::runtime::SpeculativeDecodingMode::UnderlyingType"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode9allBitSetE14UnderlyingType", "tensorrt_llm::runtime::SpeculativeDecodingMode::allBitSet"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode9allBitSetE14UnderlyingType", "tensorrt_llm::runtime::SpeculativeDecodingMode::allBitSet::bits"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode9anyBitSetE14UnderlyingType", "tensorrt_llm::runtime::SpeculativeDecodingMode::anyBitSet"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode9anyBitSetE14UnderlyingType", "tensorrt_llm::runtime::SpeculativeDecodingMode::anyBitSet::bits"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode14hasDraftLogitsEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::hasDraftLogits"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode21isDraftTokensExternalEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::isDraftTokensExternal"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode7isEagleEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::isEagle"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode21isExplicitDraftTokensEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::isExplicitDraftTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode19isLookaheadDecodingEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::isLookaheadDecoding"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode8isMedusaEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::isMedusa"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode6isNoneEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::isNone"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode20kDraftTokensExternalE", "tensorrt_llm::runtime::SpeculativeDecodingMode::kDraftTokensExternal"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode6kEagleE", "tensorrt_llm::runtime::SpeculativeDecodingMode::kEagle"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode20kExplicitDraftTokensE", "tensorrt_llm::runtime::SpeculativeDecodingMode::kExplicitDraftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode18kLookaheadDecodingE", "tensorrt_llm::runtime::SpeculativeDecodingMode::kLookaheadDecoding"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode7kMedusaE", "tensorrt_llm::runtime::SpeculativeDecodingMode::kMedusa"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode5kNoneE", "tensorrt_llm::runtime::SpeculativeDecodingMode::kNone"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime23SpeculativeDecodingMode6mStateE", "tensorrt_llm::runtime::SpeculativeDecodingMode::mState"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode20needsDecoderPrologueEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::needsDecoderPrologue"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode18needsKVCacheRewindEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::needsKVCacheRewind"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingModeeqERK23SpeculativeDecodingMode", "tensorrt_llm::runtime::SpeculativeDecodingMode::operator=="], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingModeeqERK23SpeculativeDecodingMode", "tensorrt_llm::runtime::SpeculativeDecodingMode::operator==::other"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode19predictsDraftTokensEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::predictsDraftTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode21requiresAttentionMaskEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::requiresAttentionMask"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode18updatesPositionIdsEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::updatesPositionIds"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime23SpeculativeDecodingMode19variableDraftLengthEv", "tensorrt_llm::runtime::SpeculativeDecodingMode::variableDraftLength"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModuleE", "tensorrt_llm::runtime::SpeculativeDecodingModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::SpeculativeDecodingModule::SpeculativeDecodingModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleERK25SpeculativeDecodingModule", "tensorrt_llm::runtime::SpeculativeDecodingModule::SpeculativeDecodingModule"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleEv", 
"tensorrt_llm::runtime::SpeculativeDecodingModule::SpeculativeDecodingModule"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::SpeculativeDecodingModule::SpeculativeDecodingModule::maxDecodingDraftTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::SpeculativeDecodingModule::SpeculativeDecodingModule::maxDraftPathLen"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleE10SizeType3210SizeType3210SizeType32", "tensorrt_llm::runtime::SpeculativeDecodingModule::SpeculativeDecodingModule::maxNumPaths"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule25SpeculativeDecodingModuleERK25SpeculativeDecodingModule", "tensorrt_llm::runtime::SpeculativeDecodingModule::SpeculativeDecodingModule::o"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule21computeNumPackedMasksEv", "tensorrt_llm::runtime::SpeculativeDecodingModule::computeNumPackedMasks"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule25getMaxDecodingDraftTokensEv", "tensorrt_llm::runtime::SpeculativeDecodingModule::getMaxDecodingDraftTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule20getMaxDecodingTokensEv", "tensorrt_llm::runtime::SpeculativeDecodingModule::getMaxDecodingTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule18getMaxDraftPathLenEv", "tensorrt_llm::runtime::SpeculativeDecodingModule::getMaxDraftPathLen"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule14getMaxNumPathsEv", "tensorrt_llm::runtime::SpeculativeDecodingModule::getMaxNumPaths"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule13getMaxPathLenEv", "tensorrt_llm::runtime::SpeculativeDecodingModule::getMaxPathLen"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime25SpeculativeDecodingModule17getNumPackedMasksEv", "tensorrt_llm::runtime::SpeculativeDecodingModule::getNumPackedMasks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule23mMaxDecodingDraftTokensE", "tensorrt_llm::runtime::SpeculativeDecodingModule::mMaxDecodingDraftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule16mMaxDraftPathLenE", "tensorrt_llm::runtime::SpeculativeDecodingModule::mMaxDraftPathLen"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule18mMaxNumPackedMasksE", "tensorrt_llm::runtime::SpeculativeDecodingModule::mMaxNumPackedMasks"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule12mMaxNumPathsE", "tensorrt_llm::runtime::SpeculativeDecodingModule::mMaxNumPaths"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModuleaSERK25SpeculativeDecodingModule", "tensorrt_llm::runtime::SpeculativeDecodingModule::operator="], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModuleaSERK25SpeculativeDecodingModule", "tensorrt_llm::runtime::SpeculativeDecodingModule::operator=::o"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule18setMaxDraftPathLenE10SizeType32", "tensorrt_llm::runtime::SpeculativeDecodingModule::setMaxDraftPathLen"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule18setMaxDraftPathLenE10SizeType32", "tensorrt_llm::runtime::SpeculativeDecodingModule::setMaxDraftPathLen::maxDraftPathLen"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule17setMaxDraftTokensE10SizeType32", "tensorrt_llm::runtime::SpeculativeDecodingModule::setMaxDraftTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule17setMaxDraftTokensE10SizeType32", "tensorrt_llm::runtime::SpeculativeDecodingModule::setMaxDraftTokens::maxDraftTokens"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule14setMaxNumPathsE10SizeType32", 
"tensorrt_llm::runtime::SpeculativeDecodingModule::setMaxNumPaths"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModule14setMaxNumPathsE10SizeType32", "tensorrt_llm::runtime::SpeculativeDecodingModule::setMaxNumPaths::maxNumPaths"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime25SpeculativeDecodingModuleD0Ev", "tensorrt_llm::runtime::SpeculativeDecodingModule::~SpeculativeDecodingModule"], [1, 1, 1, "_CPPv4I0EN12tensorrt_llm7runtime12StringPtrMapE", "tensorrt_llm::runtime::StringPtrMap"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime12StringPtrMapE", "tensorrt_llm::runtime::StringPtrMap::T"], [1, 2, 1, "_CPPv4I0_bEN12tensorrt_llm7runtime11TRTDataTypeE", "tensorrt_llm::runtime::TRTDataType"], [1, 8, 1, "_CPPv4I0_bEN12tensorrt_llm7runtime11TRTDataTypeE", "tensorrt_llm::runtime::TRTDataType::T"], [1, 2, 1, "_CPPv4I0EN12tensorrt_llm7runtime11TRTDataTypeIP1TEE", "tensorrt_llm::runtime::TRTDataType&lt;T*&gt;"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime11TRTDataTypeIP1TEE", "tensorrt_llm::runtime::TRTDataType&lt;T*&gt;::T"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIP1TE15kUnderlyingTypeE", "tensorrt_llm::runtime::TRTDataType&lt;T*&gt;::kUnderlyingType"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIP1TE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;T*&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIbEE", "tensorrt_llm::runtime::TRTDataType&lt;bool&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIbE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;bool&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIfEE", "tensorrt_llm::runtime::TRTDataType&lt;float&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIfE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;float&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeI4halfEE", "tensorrt_llm::runtime::TRTDataType&lt;half&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeI4halfE5valueE", 
"tensorrt_llm::runtime::TRTDataType&lt;half&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIN7kernels13FinishedStateEEE", "tensorrt_llm::runtime::TRTDataType&lt;kernels::FinishedState&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIN7kernels13FinishedStateEE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;kernels::FinishedState&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIN7kernels12KVCacheIndexEEE", "tensorrt_llm::runtime::TRTDataType&lt;kernels::KVCacheIndex&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIN7kernels12KVCacheIndexEE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;kernels::KVCacheIndex&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIN7runtime11RequestTypeEEE", "tensorrt_llm::runtime::TRTDataType&lt;runtime::RequestType&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIN7runtime11RequestTypeEE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;runtime::RequestType&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt7int32_tEEE", "tensorrt_llm::runtime::TRTDataType&lt;std::int32_t&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt7int32_tEE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;std::int32_t&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt7int64_tEEE", "tensorrt_llm::runtime::TRTDataType&lt;std::int64_t&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt7int64_tEE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;std::int64_t&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt6int8_tEEE", "tensorrt_llm::runtime::TRTDataType&lt;std::int8_t&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt6int8_tEE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;std::int8_t&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt8uint32_tEEE", "tensorrt_llm::runtime::TRTDataType&lt;std::uint32_t&gt;"], [1, 5, 1, 
"_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt8uint32_tEE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;std::uint32_t&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt8uint64_tEEE", "tensorrt_llm::runtime::TRTDataType&lt;std::uint64_t&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt8uint64_tEE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;std::uint64_t&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeINSt7uint8_tEEE", "tensorrt_llm::runtime::TRTDataType&lt;std::uint8_t&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeINSt7uint8_tEE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;std::uint8_t&gt;::value"], [1, 2, 1, "_CPPv4IEN12tensorrt_llm7runtime11TRTDataTypeIPvEE", "tensorrt_llm::runtime::TRTDataType&lt;void*&gt;"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11TRTDataTypeIPvE5valueE", "tensorrt_llm::runtime::TRTDataType&lt;void*&gt;::value"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime10TllmLoggerE", "tensorrt_llm::runtime::TllmLogger"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10TllmLogger8getLevelEv", "tensorrt_llm::runtime::TllmLogger::getLevel"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10TllmLogger3logE8SeverityPKN8nvinfer19AsciiCharE", "tensorrt_llm::runtime::TllmLogger::log"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10TllmLogger3logE8SeverityPKN8nvinfer19AsciiCharE", "tensorrt_llm::runtime::TllmLogger::log::msg"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10TllmLogger3logE8SeverityPKN8nvinfer19AsciiCharE", "tensorrt_llm::runtime::TllmLogger::log::severity"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime10TllmLogger8setLevelE8Severity", "tensorrt_llm::runtime::TllmLogger::setLevel"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime10TllmLogger8setLevelE8Severity", "tensorrt_llm::runtime::TllmLogger::setLevel::level"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime16TokenExtraIdTypeE", "tensorrt_llm::runtime::TokenExtraIdType"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime11TokenIdTypeE", 
"tensorrt_llm::runtime::TokenIdType"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime11UniqueTokenE", "tensorrt_llm::runtime::UniqueToken"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11UniqueTokeneqERK11UniqueToken", "tensorrt_llm::runtime::UniqueToken::operator=="], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11UniqueTokeneqERK11UniqueToken", "tensorrt_llm::runtime::UniqueToken::operator==::other"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11UniqueToken12tokenExtraIdE", "tensorrt_llm::runtime::UniqueToken::tokenExtraId"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11UniqueToken7tokenIdE", "tensorrt_llm::runtime::UniqueToken::tokenId"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime16VecTokenExtraIdsE", "tensorrt_llm::runtime::VecTokenExtraIds"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime15VecUniqueTokensE", "tensorrt_llm::runtime::VecUniqueTokens"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfigE", "tensorrt_llm::runtime::WorldConfig"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig11WorldConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::WorldConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig11WorldConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::WorldConfig::contextParallelism"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig11WorldConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::WorldConfig::deviceIds"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig11WorldConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::WorldConfig::enableAttentionDP"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime11WorldConfig11WorldConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::WorldConfig::gpusPerNode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig11WorldConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::WorldConfig::pipelineParallelism"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig11WorldConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::WorldConfig::rank"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig11WorldConfigE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::WorldConfig::tensorParallelism"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig17enableAttentionDPEv", "tensorrt_llm::runtime::WorldConfig::enableAttentionDP"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig23getContextParallelGroupEv", "tensorrt_llm::runtime::WorldConfig::getContextParallelGroup"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig22getContextParallelRankEv", "tensorrt_llm::runtime::WorldConfig::getContextParallelRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig21getContextParallelismEv", "tensorrt_llm::runtime::WorldConfig::getContextParallelism"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig9getDeviceEv", "tensorrt_llm::runtime::WorldConfig::getDevice"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig11getDeviceOfE10SizeType32", "tensorrt_llm::runtime::WorldConfig::getDeviceOf"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig11getDeviceOfE10SizeType32", "tensorrt_llm::runtime::WorldConfig::getDeviceOf::rank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig15getGpusPerGroupEv", 
"tensorrt_llm::runtime::WorldConfig::getGpusPerGroup"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig14getGpusPerNodeEv", "tensorrt_llm::runtime::WorldConfig::getGpusPerNode"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig11getLastRankEv", "tensorrt_llm::runtime::WorldConfig::getLastRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig12getLocalRankEv", "tensorrt_llm::runtime::WorldConfig::getLocalRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig11getNodeRankEv", "tensorrt_llm::runtime::WorldConfig::getNodeRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig13getNodeRankOfE10SizeType32", "tensorrt_llm::runtime::WorldConfig::getNodeRankOf"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig13getNodeRankOfE10SizeType32", "tensorrt_llm::runtime::WorldConfig::getNodeRankOf::rank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig24getPipelineParallelGroupEv", "tensorrt_llm::runtime::WorldConfig::getPipelineParallelGroup"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig23getPipelineParallelRankEv", "tensorrt_llm::runtime::WorldConfig::getPipelineParallelRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig22getPipelineParallelismEv", "tensorrt_llm::runtime::WorldConfig::getPipelineParallelism"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig7getRankEv", "tensorrt_llm::runtime::WorldConfig::getRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig7getSizeEv", "tensorrt_llm::runtime::WorldConfig::getSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig22getTensorParallelGroupEv", "tensorrt_llm::runtime::WorldConfig::getTensorParallelGroup"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig21getTensorParallelRankEv", "tensorrt_llm::runtime::WorldConfig::getTensorParallelRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig20getTensorParallelismEv", "tensorrt_llm::runtime::WorldConfig::getTensorParallelism"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime11WorldConfig17isContextParallelEv", "tensorrt_llm::runtime::WorldConfig::isContextParallel"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig26isFirstContextParallelRankEv", "tensorrt_llm::runtime::WorldConfig::isFirstContextParallelRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig27isFirstPipelineParallelRankEv", "tensorrt_llm::runtime::WorldConfig::isFirstPipelineParallelRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig25isFirstTensorParallelRankEv", "tensorrt_llm::runtime::WorldConfig::isFirstTensorParallelRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig26isLastPipelineParallelRankEv", "tensorrt_llm::runtime::WorldConfig::isLastPipelineParallelRank"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig18isPipelineParallelEv", "tensorrt_llm::runtime::WorldConfig::isPipelineParallel"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig16isTensorParallelEv", "tensorrt_llm::runtime::WorldConfig::isTensorParallel"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig19kDefaultGpusPerNodeE", "tensorrt_llm::runtime::WorldConfig::kDefaultGpusPerNode"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig19mContextParallelismE", "tensorrt_llm::runtime::WorldConfig::mContextParallelism"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig10mDeviceIdsE", "tensorrt_llm::runtime::WorldConfig::mDeviceIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig18mEnableAttentionDPE", "tensorrt_llm::runtime::WorldConfig::mEnableAttentionDP"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig12mGpusPerNodeE", "tensorrt_llm::runtime::WorldConfig::mGpusPerNode"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig20mPipelineParallelismE", "tensorrt_llm::runtime::WorldConfig::mPipelineParallelism"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig5mRankE", "tensorrt_llm::runtime::WorldConfig::mRank"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig18mTensorParallelismE", 
"tensorrt_llm::runtime::WorldConfig::mTensorParallelism"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig3mpiE10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::mpi"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig3mpiE10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::mpi::contextParallelism"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig3mpiE10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::mpi::deviceIds"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig3mpiE10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::mpi::enableAttentionDP"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig3mpiE10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::mpi::gpusPerNode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig3mpiE10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::mpi::pipelineParallelism"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11WorldConfig3mpiE10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EENSt8optionalI10SizeType32EERKNSt8optionalINSt6vectorI10SizeType32EEEEb", "tensorrt_llm::runtime::WorldConfig::mpi::tensorParallelism"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime11WorldConfig14validMpiConfigEv", "tensorrt_llm::runtime::WorldConfig::validMpiConfig"], [1, 3, 1, 
"_CPPv4I0EN12tensorrt_llm7runtime10bufferCastEP1TR7IBuffer", "tensorrt_llm::runtime::bufferCast"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime10bufferCastEPK1TRK7IBuffer", "tensorrt_llm::runtime::bufferCast"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime10bufferCastEP1TR7IBuffer", "tensorrt_llm::runtime::bufferCast::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime10bufferCastEPK1TRK7IBuffer", "tensorrt_llm::runtime::bufferCast::T"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime10bufferCastEP1TR7IBuffer", "tensorrt_llm::runtime::bufferCast::buffer"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime10bufferCastEPK1TRK7IBuffer", "tensorrt_llm::runtime::bufferCast::buffer"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKN7IBuffer9SharedPtrE", "tensorrt_llm::runtime::bufferCastOrNull"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKN7ITensor9SharedPtrE", "tensorrt_llm::runtime::bufferCastOrNull"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKNSt8optionalIN7IBuffer9SharedPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKNSt8optionalIN7ITensor9SharedPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKN7IBuffer14SharedConstPtrE", "tensorrt_llm::runtime::bufferCastOrNull"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKN7ITensor14SharedConstPtrE", "tensorrt_llm::runtime::bufferCastOrNull"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKNSt8optionalIN7IBuffer14SharedConstPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKNSt8optionalIN7ITensor14SharedConstPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKN7IBuffer9SharedPtrE", "tensorrt_llm::runtime::bufferCastOrNull::T"], [1, 8, 1, 
"_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKN7ITensor9SharedPtrE", "tensorrt_llm::runtime::bufferCastOrNull::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKNSt8optionalIN7IBuffer9SharedPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKNSt8optionalIN7ITensor9SharedPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKN7IBuffer14SharedConstPtrE", "tensorrt_llm::runtime::bufferCastOrNull::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKN7ITensor14SharedConstPtrE", "tensorrt_llm::runtime::bufferCastOrNull::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKNSt8optionalIN7IBuffer14SharedConstPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKNSt8optionalIN7ITensor14SharedConstPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull::T"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKN7IBuffer9SharedPtrE", "tensorrt_llm::runtime::bufferCastOrNull::bufferPtr"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKN7IBuffer14SharedConstPtrE", "tensorrt_llm::runtime::bufferCastOrNull::bufferPtr"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKNSt8optionalIN7IBuffer9SharedPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull::optionalBufferPtr"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKNSt8optionalIN7IBuffer14SharedConstPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull::optionalBufferPtr"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKNSt8optionalIN7ITensor9SharedPtrEEE", "tensorrt_llm::runtime::bufferCastOrNull::optionalTensorPtr"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKNSt8optionalIN7ITensor14SharedConstPtrEEE", 
"tensorrt_llm::runtime::bufferCastOrNull::optionalTensorPtr"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEP1TRKN7ITensor9SharedPtrE", "tensorrt_llm::runtime::bufferCastOrNull::tensorPtr"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime16bufferCastOrNullEPK1TRKN7ITensor14SharedConstPtrE", "tensorrt_llm::runtime::bufferCastOrNull::tensorPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13canAccessPeerERK11WorldConfig", "tensorrt_llm::runtime::canAccessPeer"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13canAccessPeerERK11WorldConfig", "tensorrt_llm::runtime::canAccessPeer::worldConfig"], [1, 3, 1, "_CPPv4I00EN12tensorrt_llm7runtime16constPointerCastENSt10shared_ptrINSt14remove_const_tI1TEEEERRNSt10unique_ptrI1T1DEE", "tensorrt_llm::runtime::constPointerCast"], [1, 3, 1, "_CPPv4I0EN12tensorrt_llm7runtime16constPointerCastENSt10shared_ptrINSt14remove_const_tI1TEEEERKNSt10shared_ptrI1TEE", "tensorrt_llm::runtime::constPointerCast"], [1, 8, 1, "_CPPv4I00EN12tensorrt_llm7runtime16constPointerCastENSt10shared_ptrINSt14remove_const_tI1TEEEERRNSt10unique_ptrI1T1DEE", "tensorrt_llm::runtime::constPointerCast::D"], [1, 8, 1, "_CPPv4I00EN12tensorrt_llm7runtime16constPointerCastENSt10shared_ptrINSt14remove_const_tI1TEEEERRNSt10unique_ptrI1T1DEE", "tensorrt_llm::runtime::constPointerCast::T"], [1, 8, 1, "_CPPv4I0EN12tensorrt_llm7runtime16constPointerCastENSt10shared_ptrINSt14remove_const_tI1TEEEERKNSt10shared_ptrI1TEE", "tensorrt_llm::runtime::constPointerCast::T"], [1, 4, 1, "_CPPv4I00EN12tensorrt_llm7runtime16constPointerCastENSt10shared_ptrINSt14remove_const_tI1TEEEERRNSt10unique_ptrI1T1DEE", "tensorrt_llm::runtime::constPointerCast::ptr"], [1, 4, 1, "_CPPv4I0EN12tensorrt_llm7runtime16constPointerCastENSt10shared_ptrINSt14remove_const_tI1TEEEERKNSt10shared_ptrI1TEE", "tensorrt_llm::runtime::constPointerCast::ptr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7decoderE", "tensorrt_llm::runtime::decoder"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7decoderE", 
"tensorrt_llm::runtime::decoder"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffersE", "tensorrt_llm::runtime::decoder::BeamSearchBuffers"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers17BeamSearchBuffersERK13BufferManager", "tensorrt_llm::runtime::decoder::BeamSearchBuffers::BeamSearchBuffers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers17BeamSearchBuffersERK13BufferManager", "tensorrt_llm::runtime::decoder::BeamSearchBuffers::BeamSearchBuffers::bufferManager"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers15mCumLogProbsTmpE", "tensorrt_llm::runtime::decoder::BeamSearchBuffers::mCumLogProbsTmp"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers7mNumSMsE", "tensorrt_llm::runtime::decoder::BeamSearchBuffers::mNumSMs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers21mOutputBeamHypothesesE", "tensorrt_llm::runtime::decoder::BeamSearchBuffers::mOutputBeamHypotheses"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers7reshapeE10SizeType3210SizeType32", "tensorrt_llm::runtime::decoder::BeamSearchBuffers::reshape"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers7reshapeE10SizeType3210SizeType32", "tensorrt_llm::runtime::decoder::BeamSearchBuffers::reshape::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder17BeamSearchBuffers7reshapeE10SizeType3210SizeType32", "tensorrt_llm::runtime::decoder::BeamSearchBuffers::reshape::maxSequenceLength"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderStateE", "tensorrt_llm::runtime::decoder::DecoderState"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState12DecoderStateEN8nvinfer18DataTypeERK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::DecoderState"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState12DecoderStateEN8nvinfer18DataTypeERK13BufferManager", 
"tensorrt_llm::runtime::decoder::DecoderState::DecoderState::bufferManager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState12DecoderStateEN8nvinfer18DataTypeERK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::DecoderState::dtype"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState16DecodingInputPtrE", "tensorrt_llm::runtime::decoder::DecoderState::DecodingInputPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState17DecodingOutputPtrE", "tensorrt_llm::runtime::decoder::DecoderState::DecodingOutputPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState13LlmRequestPtrE", "tensorrt_llm::runtime::decoder::DecoderState::LlmRequestPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState13RequestVectorE", "tensorrt_llm::runtime::decoder::DecoderState::RequestVector"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState9TensorPtrE", "tensorrt_llm::runtime::decoder::DecoderState::TensorPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState34allocateSpeculativeDecodingBuffersE23SpeculativeDecodingModeN8nvinfer18DataTypeERK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::allocateSpeculativeDecodingBuffers"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState34allocateSpeculativeDecodingBuffersE23SpeculativeDecodingModeN8nvinfer18DataTypeERK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::allocateSpeculativeDecodingBuffers::bufferManager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState34allocateSpeculativeDecodingBuffersE23SpeculativeDecodingModeN8nvinfer18DataTypeERK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::allocateSpeculativeDecodingBuffers::dtype"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState34allocateSpeculativeDecodingBuffersE23SpeculativeDecodingModeN8nvinfer18DataTypeERK13BufferManager", 
"tensorrt_llm::runtime::decoder::DecoderState::allocateSpeculativeDecodingBuffers::speculativeDecodingMode"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState16disableLookaheadERK13RequestVector", "tensorrt_llm::runtime::decoder::DecoderState::disableLookahead"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState16disableLookaheadERK13RequestVector", "tensorrt_llm::runtime::decoder::DecoderState::disableLookahead::genRequests"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState24getAcceptedLengthsCumSumEv", "tensorrt_llm::runtime::decoder::DecoderState::getAcceptedLengthsCumSum"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState22getAcceptedPackedPathsEv", "tensorrt_llm::runtime::decoder::DecoderState::getAcceptedPackedPaths"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getAllNewTokensEv", "tensorrt_llm::runtime::decoder::DecoderState::getAllNewTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState20getBeamSearchBuffersEv", "tensorrt_llm::runtime::decoder::DecoderState::getBeamSearchBuffers"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getCumLogProbsE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getCumLogProbs"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getCumLogProbsEv", "tensorrt_llm::runtime::decoder::DecoderState::getCumLogProbs"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getCumLogProbsE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getCumLogProbs::batchIdx"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState16getFinishReasonsEv", "tensorrt_llm::runtime::decoder::DecoderState::getFinishReasons"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState16getFinishedStepsEv", "tensorrt_llm::runtime::decoder::DecoderState::getFinishedSteps"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getFinishedSumEv", 
"tensorrt_llm::runtime::decoder::DecoderState::getFinishedSum"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getGatheredIdsE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getGatheredIds"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getGatheredIdsEv", "tensorrt_llm::runtime::decoder::DecoderState::getGatheredIds"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14getGatheredIdsE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getGatheredIds::batchIdx"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState6getIdsE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getIds"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState6getIdsEv", "tensorrt_llm::runtime::decoder::DecoderState::getIds"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState6getIdsE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getIds::batchIdx"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState21getJointDecodingInputEv", "tensorrt_llm::runtime::decoder::DecoderState::getJointDecodingInput"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState22getJointDecodingOutputEv", "tensorrt_llm::runtime::decoder::DecoderState::getJointDecodingOutput"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState11getLogProbsE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getLogProbs"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState11getLogProbsEv", "tensorrt_llm::runtime::decoder::DecoderState::getLogProbs"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState11getLogProbsE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getLogProbs::batchIdx"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBatchSizeEv", "tensorrt_llm::runtime::decoder::DecoderState::getMaxBatchSize"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState15getMaxBeamWidthEv", 
"tensorrt_llm::runtime::decoder::DecoderState::getMaxBeamWidth"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState27getMaxDecodingDecoderTokensEv", "tensorrt_llm::runtime::decoder::DecoderState::getMaxDecodingDecoderTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getMaxDecodingEngineTokensEv", "tensorrt_llm::runtime::decoder::DecoderState::getMaxDecodingEngineTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState20getMaxSequenceLengthEv", "tensorrt_llm::runtime::decoder::DecoderState::getMaxSequenceLength"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getNextDraftTokensEv", "tensorrt_llm::runtime::decoder::DecoderState::getNextDraftTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState25getNextDraftTokensLengthsEv", "tensorrt_llm::runtime::decoder::DecoderState::getNextDraftTokensLengths"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getNumDecodingEngineTokensE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getNumDecodingEngineTokens"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getNumDecodingEngineTokensEv", "tensorrt_llm::runtime::decoder::DecoderState::getNumDecodingEngineTokens"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getNumDecodingEngineTokensE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getNumDecodingEngineTokens::batchIdx"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState12getParentIdsEv", "tensorrt_llm::runtime::decoder::DecoderState::getParentIds"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState25getPrevDraftTokensLengthsEv", "tensorrt_llm::runtime::decoder::DecoderState::getPrevDraftTokensLengths"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getSequenceLengths"], [1, 3, 1, 
"_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsEv", "tensorrt_llm::runtime::decoder::DecoderState::getSequenceLengths"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState18getSequenceLengthsE10SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::getSequenceLengths::batchIdx"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState26getSpeculativeDecodingModeEv", "tensorrt_llm::runtime::decoder::DecoderState::getSpeculativeDecodingMode"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState18mBeamSearchBuffersE", "tensorrt_llm::runtime::decoder::DecoderState::mBeamSearchBuffers"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState14mFinishedStepsE", "tensorrt_llm::runtime::decoder::DecoderState::mFinishedSteps"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState19mJointDecodingInputE", "tensorrt_llm::runtime::decoder::DecoderState::mJointDecodingInput"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState20mJointDecodingOutputE", "tensorrt_llm::runtime::decoder::DecoderState::mJointDecodingOutput"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState13mMaxBatchSizeE", "tensorrt_llm::runtime::decoder::DecoderState::mMaxBatchSize"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState13mMaxBeamWidthE", "tensorrt_llm::runtime::decoder::DecoderState::mMaxBeamWidth"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState25mMaxDecodingDecoderTokensE", "tensorrt_llm::runtime::decoder::DecoderState::mMaxDecodingDecoderTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24mMaxDecodingEngineTokensE", "tensorrt_llm::runtime::decoder::DecoderState::mMaxDecodingEngineTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState18mMaxSequenceLengthE", "tensorrt_llm::runtime::decoder::DecoderState::mMaxSequenceLength"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24mNumDecodingEngineTokensE", 
"tensorrt_llm::runtime::decoder::DecoderState::mNumDecodingEngineTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24mSpeculativeDecodingModeE", "tensorrt_llm::runtime::decoder::DecoderState::mSpeculativeDecodingMode"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState26setNumDecodingEngineTokensE10SizeType3210SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::setNumDecodingEngineTokens"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState26setNumDecodingEngineTokensE10SizeType3210SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::setNumDecodingEngineTokens::batchIdx"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState26setNumDecodingEngineTokensE10SizeType3210SizeType32", "tensorrt_llm::runtime::decoder::DecoderState::setNumDecodingEngineTokens::numTokens"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setup"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setup::bufferManager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setup::maxAttentionWindow"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setup::maxBatchSize"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setup::maxBeamWidth"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setup::maxSequenceLength"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setup::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setup::sinkTokenLength"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState5setupE10SizeType3210SizeType3210SizeType3210SizeType3210SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setup::worldConfig"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState10setupEagleEN12EagleBuffers6InputsE", "tensorrt_llm::runtime::decoder::DecoderState::setupEagle"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState10setupEagleEN12EagleBuffers6InputsE", "tensorrt_llm::runtime::decoder::DecoderState::setupEagle::eagleBuffers"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState24setupExplicitDraftTokensEN26ExplicitDraftTokensBuffers6InputsE", "tensorrt_llm::runtime::decoder::DecoderState::setupExplicitDraftTokens"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState24setupExplicitDraftTokensEN26ExplicitDraftTokensBuffers6InputsE", 
"tensorrt_llm::runtime::decoder::DecoderState::setupExplicitDraftTokens::explicitDraftTokensBuffers"], [1, 3, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14setupLookaheadE24LookaheadDecodingBuffers", "tensorrt_llm::runtime::decoder::DecoderState::setupLookahead"], [1, 4, 1, "_CPPv4NK12tensorrt_llm7runtime7decoder12DecoderState14setupLookaheadE24LookaheadDecodingBuffers", "tensorrt_llm::runtime::decoder::DecoderState::setupLookahead::lookaheadDecodingBuffers"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24setupSpeculativeDecodingERK23SpeculativeDecodingMode10SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setupSpeculativeDecoding"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24setupSpeculativeDecodingERK23SpeculativeDecodingMode10SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setupSpeculativeDecoding::bufferManager"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24setupSpeculativeDecodingERK23SpeculativeDecodingMode10SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setupSpeculativeDecoding::maxTokensPerEngineStep"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24setupSpeculativeDecodingERK23SpeculativeDecodingMode10SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setupSpeculativeDecoding::modelConfig"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24setupSpeculativeDecodingERK23SpeculativeDecodingMode10SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", "tensorrt_llm::runtime::decoder::DecoderState::setupSpeculativeDecoding::speculativeDecodingMode"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime7decoder12DecoderState24setupSpeculativeDecodingERK23SpeculativeDecodingMode10SizeType32RK11ModelConfigRK11WorldConfigRK13BufferManager", 
"tensorrt_llm::runtime::decoder::DecoderState::setupSpeculativeDecoding::worldConfig"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batchE", "tensorrt_llm::runtime::decoder_batch"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batchE", "tensorrt_llm::runtime::decoder_batch"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5InputE", "tensorrt_llm::runtime::decoder_batch::Input"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input5InputERKNSt6vectorI14TensorConstPtrEE", "tensorrt_llm::runtime::decoder_batch::Input::Input"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input5InputERKNSt6vectorINSt6vectorI14TensorConstPtrEEEE10SizeType32", "tensorrt_llm::runtime::decoder_batch::Input::Input"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input5InputERKNSt6vectorI14TensorConstPtrEE", "tensorrt_llm::runtime::decoder_batch::Input::Input::logits"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input5InputERKNSt6vectorINSt6vectorI14TensorConstPtrEEEE10SizeType32", "tensorrt_llm::runtime::decoder_batch::Input::Input::logits"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input5InputERKNSt6vectorINSt6vectorI14TensorConstPtrEEEE10SizeType32", "tensorrt_llm::runtime::decoder_batch::Input::Input::maxDecoderSteps"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input14TensorConstPtrE", "tensorrt_llm::runtime::decoder_batch::Input::TensorConstPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input9TensorPtrE", "tensorrt_llm::runtime::decoder_batch::Input::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input10batchSlotsE", "tensorrt_llm::runtime::decoder_batch::Input::batchSlots"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input22batchSlotsRequestOrderE", "tensorrt_llm::runtime::decoder_batch::Input::batchSlotsRequestOrder"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input16cacheIndirectionE", 
"tensorrt_llm::runtime::decoder_batch::Input::cacheIndirection"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input11eagleInputsE", "tensorrt_llm::runtime::decoder_batch::Input::eagleInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input15eagleLastInputsE", "tensorrt_llm::runtime::decoder_batch::Input::eagleLastInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input25explicitDraftTokensInputsE", "tensorrt_llm::runtime::decoder_batch::Input::explicitDraftTokensInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input29explicitDraftTokensLastInputsE", "tensorrt_llm::runtime::decoder_batch::Input::explicitDraftTokensLastInputs"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input15generationStepsE", "tensorrt_llm::runtime::decoder_batch::Input::generationSteps"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input6logitsE", "tensorrt_llm::runtime::decoder_batch::Input::logits"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input15maxDecoderStepsE", "tensorrt_llm::runtime::decoder_batch::Input::maxDecoderSteps"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch5Input20predictedDraftLogitsE", "tensorrt_llm::runtime::decoder_batch::Input::predictedDraftLogits"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch6OutputE", "tensorrt_llm::runtime::decoder_batch::Output"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch6Output6OutputEv", "tensorrt_llm::runtime::decoder_batch::Output::Output"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch6Output9TensorPtrE", "tensorrt_llm::runtime::decoder_batch::Output::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch6Output16cacheIndirectionE", "tensorrt_llm::runtime::decoder_batch::Output::cacheIndirection"], [1, 2, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7RequestE", "tensorrt_llm::runtime::decoder_batch::Request"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request9BufferPtrE", 
"tensorrt_llm::runtime::decoder_batch::Request::BufferPtr"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request7RequestE14TensorConstPtr10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::decoder_batch::Request::Request"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request7RequestE14TensorConstPtr10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::decoder_batch::Request::Request::endId"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request7RequestE14TensorConstPtr10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::decoder_batch::Request::Request::ids"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request7RequestE14TensorConstPtr10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::decoder_batch::Request::Request::inputLen"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request7RequestE14TensorConstPtr10SizeType32NSt8optionalI10SizeType32EENSt8optionalI10SizeType32EE", "tensorrt_llm::runtime::decoder_batch::Request::Request::maxNewTokens"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request14TensorConstPtrE", "tensorrt_llm::runtime::decoder_batch::Request::TensorConstPtr"], [1, 1, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request9TensorPtrE", "tensorrt_llm::runtime::decoder_batch::Request::TensorPtr"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request12badWordsListE", "tensorrt_llm::runtime::decoder_batch::Request::badWordsList"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request11draftLogitsE", "tensorrt_llm::runtime::decoder_batch::Request::draftLogits"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request11draftTokensE", "tensorrt_llm::runtime::decoder_batch::Request::draftTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request5dtypeE", 
"tensorrt_llm::runtime::decoder_batch::Request::dtype"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request11eagleConfigE", "tensorrt_llm::runtime::decoder_batch::Request::eagleConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request13embeddingBiasE", "tensorrt_llm::runtime::decoder_batch::Request::embeddingBias"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request5endIdE", "tensorrt_llm::runtime::decoder_batch::Request::endId"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request28generatedTokensPerEngineStepE", "tensorrt_llm::runtime::decoder_batch::Request::generatedTokensPerEngineStep"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request3idsE", "tensorrt_llm::runtime::decoder_batch::Request::ids"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request8inputLenE", "tensorrt_llm::runtime::decoder_batch::Request::inputLen"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request22lookaheadRuntimeConfigE", "tensorrt_llm::runtime::decoder_batch::Request::lookaheadRuntimeConfig"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request12maxNewTokensE", "tensorrt_llm::runtime::decoder_batch::Request::maxNewTokens"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request11medusaPathsE", "tensorrt_llm::runtime::decoder_batch::Request::medusaPaths"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request13medusaTreeIdsE", "tensorrt_llm::runtime::decoder_batch::Request::medusaTreeIds"], [1, 5, 1, "_CPPv4N12tensorrt_llm7runtime13decoder_batch7Request13stopWordsListE", "tensorrt_llm::runtime::decoder_batch::Request::stopWordsList"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20getDefaultBatchSlotsEN7runtime10SizeType32E", "tensorrt_llm::runtime::getDefaultBatchSlots"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20getDefaultBatchSlotsEN7runtime10SizeType32E", "tensorrt_llm::runtime::getDefaultBatchSlots::batchSize"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtime15ipcNvlsAllocateE6size_tNSt3setIiEE", "tensorrt_llm::runtime::ipcNvlsAllocate"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime15ipcNvlsAllocateE6size_tNSt3setIiEE", "tensorrt_llm::runtime::ipcNvlsAllocate::ranks"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime15ipcNvlsAllocateE6size_tNSt3setIiEE", "tensorrt_llm::runtime::ipcNvlsAllocate::size"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime11ipcNvlsFreeEP13IpcNvlsHandle", "tensorrt_llm::runtime::ipcNvlsFree"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime11ipcNvlsFreeEP13IpcNvlsHandle", "tensorrt_llm::runtime::ipcNvlsFree::handle"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime16ipcNvlsSupportedEv", "tensorrt_llm::runtime::ipcNvlsSupported"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime20lamportInitializeAllEPvPvPv6size_t", "tensorrt_llm::runtime::lamportInitializeAll"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20lamportInitializeAllEPvPvPv6size_t", "tensorrt_llm::runtime::lamportInitializeAll::buffer_0"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20lamportInitializeAllEPvPvPv6size_t", "tensorrt_llm::runtime::lamportInitializeAll::buffer_1"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20lamportInitializeAllEPvPvPv6size_t", "tensorrt_llm::runtime::lamportInitializeAll::buffer_2"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime20lamportInitializeAllEPvPvPv6size_t", "tensorrt_llm::runtime::lamportInitializeAll::size"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK10LoraModule", "tensorrt_llm::runtime::operator&lt;&lt;"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK26LoraCachePageManagerConfig", "tensorrt_llm::runtime::operator&lt;&lt;"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK7IBuffer", "tensorrt_llm::runtime::operator&lt;&lt;"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK7ITensor", "tensorrt_llm::runtime::operator&lt;&lt;"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERKN7ITensor5ShapeE", "tensorrt_llm::runtime::operator&lt;&lt;"], [1, 3, 1, 
"_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERKN9LoraCache21TaskLayerModuleConfigE", "tensorrt_llm::runtime::operator&lt;&lt;"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK7IBuffer", "tensorrt_llm::runtime::operator&lt;&lt;::buffer"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK26LoraCachePageManagerConfig", "tensorrt_llm::runtime::operator&lt;&lt;::c"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERKN7ITensor5ShapeE", "tensorrt_llm::runtime::operator&lt;&lt;::dims"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK10LoraModule", "tensorrt_llm::runtime::operator&lt;&lt;::module"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK26LoraCachePageManagerConfig", "tensorrt_llm::runtime::operator&lt;&lt;::os"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERKN9LoraCache21TaskLayerModuleConfigE", "tensorrt_llm::runtime::operator&lt;&lt;::os"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK10LoraModule", "tensorrt_llm::runtime::operator&lt;&lt;::output"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK7IBuffer", "tensorrt_llm::runtime::operator&lt;&lt;::output"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK7ITensor", "tensorrt_llm::runtime::operator&lt;&lt;::output"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERKN7ITensor5ShapeE", "tensorrt_llm::runtime::operator&lt;&lt;::output"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERK7ITensor", "tensorrt_llm::runtime::operator&lt;&lt;::tensor"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtimelsERNSt7ostreamERKN9LoraCache21TaskLayerModuleConfigE", "tensorrt_llm::runtime::operator&lt;&lt;::v"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9to_stringERK26LoraCachePageManagerConfig", "tensorrt_llm::runtime::to_string"], [1, 3, 1, "_CPPv4N12tensorrt_llm7runtime9to_stringERKN9LoraCache21TaskLayerModuleConfigE", "tensorrt_llm::runtime::to_string"], [1, 4, 1, 
"_CPPv4N12tensorrt_llm7runtime9to_stringERK26LoraCachePageManagerConfig", "tensorrt_llm::runtime::to_string::c"], [1, 4, 1, "_CPPv4N12tensorrt_llm7runtime9to_stringERKN9LoraCache21TaskLayerModuleConfigE", "tensorrt_llm::runtime::to_string::v"], [87, 9, 0, "-", "tensorrt_llm"]], "tensorrt_llm": [[82, 9, 0, "-", "functional"], [84, 9, 0, "-", "models"], [85, 9, 0, "-", "plugin"], [86, 9, 0, "-", "quantization"], [87, 9, 0, "-", "runtime"]], "tensorrt_llm.functional": [[82, 10, 1, "", "AllReduceFusionOp"], [82, 10, 1, "", "AllReduceParams"], [82, 10, 1, "", "AllReduceStrategy"], [82, 10, 1, "", "AttentionMaskType"], [82, 10, 1, "", "Conditional"], [82, 10, 1, "", "DimRange"], [82, 10, 1, "", "LayerNormPositionType"], [82, 10, 1, "", "LayerNormType"], [82, 10, 1, "", "MLPType"], [82, 10, 1, "", "PositionEmbeddingType"], [82, 10, 1, "", "RopeEmbeddingUtils"], [82, 10, 1, "", "RotaryScalingType"], [82, 10, 1, "", "SideStreamIDType"], [82, 10, 1, "", "SliceInputType"], [82, 10, 1, "", "Tensor"], [82, 14, 1, "", "abs"], [82, 14, 1, "", "activation"], [82, 14, 1, "", "add"], [82, 14, 1, "", "allgather"], [82, 14, 1, "", "allreduce"], [82, 14, 1, "", "arange"], [82, 14, 1, "", "argmax"], [82, 14, 1, "", "assertion"], [82, 14, 1, "", "avg_pool2d"], [82, 14, 1, "", "bert_attention"], [82, 14, 1, "", "broadcast_helper"], [82, 14, 1, "", "cast"], [82, 14, 1, "", "categorical_sample"], [82, 14, 1, "", "chunk"], [82, 14, 1, "", "clip"], [82, 14, 1, "", "concat"], [82, 14, 1, "", "constant"], [82, 14, 1, "", "constant_to_tensor_"], [82, 14, 1, "", "constants_to_tensors_"], [82, 14, 1, "", "conv1d"], [82, 14, 1, "", "conv2d"], [82, 14, 1, "", "conv3d"], [82, 14, 1, "", "conv_transpose2d"], [82, 14, 1, "", "cos"], [82, 14, 1, "", "cp_split_plugin"], [82, 14, 1, "", "create_allreduce_plugin"], [82, 14, 1, "", "cuda_stream_sync"], [82, 14, 1, "", "cumsum"], [82, 14, 1, "", "div"], [82, 14, 1, "", "dora_plugin"], [82, 14, 1, "", "einsum"], [82, 14, 1, "", "elementwise_binary"], [82, 14, 
1, "", "embedding"], [82, 14, 1, "", "eq"], [82, 14, 1, "", "exp"], [82, 14, 1, "", "expand"], [82, 14, 1, "", "expand_dims"], [82, 14, 1, "", "expand_dims_like"], [82, 14, 1, "", "expand_mask"], [82, 14, 1, "", "flatten"], [82, 14, 1, "", "flip"], [82, 14, 1, "", "floordiv"], [82, 14, 1, "", "gather"], [82, 14, 1, "", "gather_last_token_logits"], [82, 14, 1, "", "gather_nd"], [82, 14, 1, "", "gegelu"], [82, 14, 1, "", "geglu"], [82, 14, 1, "", "gelu"], [82, 14, 1, "", "gemm_allreduce"], [82, 14, 1, "", "gemm_swiglu"], [82, 14, 1, "", "generate_alibi_biases"], [82, 14, 1, "", "generate_alibi_slopes"], [82, 14, 1, "", "generate_logn_scaling"], [82, 14, 1, "", "gpt_attention"], [82, 14, 1, "", "group_norm"], [82, 14, 1, "", "gt"], [82, 14, 1, "", "identity"], [82, 14, 1, "", "index_select"], [82, 14, 1, "", "int_clip"], [82, 14, 1, "", "interpolate"], [82, 14, 1, "", "is_gated_activation"], [82, 14, 1, "", "layer_norm"], [82, 14, 1, "", "log"], [82, 14, 1, "", "log_softmax"], [82, 14, 1, "", "lora_plugin"], [82, 14, 1, "", "low_latency_gemm"], [82, 14, 1, "", "low_latency_gemm_swiglu"], [82, 14, 1, "", "lt"], [82, 14, 1, "", "mamba_conv1d"], [82, 14, 1, "", "masked_scatter"], [82, 14, 1, "", "masked_select"], [82, 14, 1, "", "matmul"], [82, 14, 1, "", "max"], [82, 14, 1, "", "maximum"], [82, 14, 1, "", "mean"], [82, 14, 1, "", "meshgrid2d"], [82, 14, 1, "", "min"], [82, 14, 1, "", "minimum"], [82, 14, 1, "", "modulo"], [82, 14, 1, "", "mul"], [82, 14, 1, "", "non_gated_version"], [82, 14, 1, "", "nonzero"], [82, 14, 1, "", "not_op"], [82, 14, 1, "", "op_and"], [82, 14, 1, "", "op_or"], [82, 14, 1, "", "op_xor"], [82, 14, 1, "", "outer"], [82, 14, 1, "", "pad"], [82, 14, 1, "", "permute"], [82, 14, 1, "", "pow"], [82, 14, 1, "", "prod"], [82, 14, 1, "", "quick_gelu"], [82, 14, 1, "", "rand"], [82, 14, 1, "", "rearrange"], [82, 14, 1, "", "recv"], [82, 14, 1, "", "reduce"], [82, 14, 1, "", "reduce_scatter"], [82, 14, 1, "", "relu"], [82, 14, 1, "", "repeat"], [82, 14, 
1, "", "repeat_interleave"], [82, 14, 1, "", "rg_lru"], [82, 14, 1, "", "rms_norm"], [82, 14, 1, "", "round"], [82, 14, 1, "", "scatter"], [82, 14, 1, "", "scatter_nd"], [82, 14, 1, "", "select"], [82, 14, 1, "", "selective_scan"], [82, 14, 1, "", "send"], [82, 14, 1, "", "shape"], [82, 14, 1, "", "sigmoid"], [82, 14, 1, "", "silu"], [82, 14, 1, "", "sin"], [82, 14, 1, "", "slice"], [82, 14, 1, "", "softmax"], [82, 14, 1, "", "softplus"], [82, 14, 1, "", "split"], [82, 14, 1, "", "sqrt"], [82, 14, 1, "", "squared_relu"], [82, 14, 1, "", "squeeze"], [82, 14, 1, "", "stack"], [82, 14, 1, "", "sub"], [82, 14, 1, "", "sum"], [82, 14, 1, "", "swiglu"], [82, 14, 1, "", "tanh"], [82, 14, 1, "", "topk"], [82, 14, 1, "", "transpose"], [82, 14, 1, "", "unary"], [82, 14, 1, "", "unbind"], [82, 14, 1, "", "unsqueeze"], [82, 14, 1, "", "view"], [82, 14, 1, "", "where"]], "tensorrt_llm.functional.AllReduceFusionOp": [[82, 11, 1, "", "LAST_PROCESS_FOR_UB"], [82, 11, 1, "", "MOE_ALLREDUCE_RESIDUAL_RMS_NORM"], [82, 11, 1, "", "NONE"], [82, 11, 1, "", "RESIDUAL_RMS_NORM"], [82, 11, 1, "", "RESIDUAL_RMS_NORM_OUT_QUANT_FP8"], [82, 11, 1, "", "RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4"], [82, 11, 1, "", "RESIDUAL_RMS_NORM_QUANT_FP8"], [82, 11, 1, "", "RESIDUAL_RMS_NORM_QUANT_NVFP4"], [82, 11, 1, "", "RESIDUAL_RMS_PREPOST_NORM"]], "tensorrt_llm.functional.AllReduceParams": [[82, 12, 1, "", "has_affine"], [82, 12, 1, "", "has_bias"], [82, 12, 1, "", "has_scale"], [82, 12, 1, "", "update_strategy"]], "tensorrt_llm.functional.AllReduceStrategy": [[82, 11, 1, "", "AUTO"], [82, 11, 1, "", "LOWPRECISION"], [82, 11, 1, "", "MIN_LATENCY"], [82, 11, 1, "", "NCCL"], [82, 11, 1, "", "ONESHOT"], [82, 11, 1, "", "TWOSHOT"], [82, 11, 1, "", "UB"]], "tensorrt_llm.functional.AttentionMaskType": [[82, 11, 1, "", "bidirectional"], [82, 11, 1, "", "bidirectionalglm"], [82, 11, 1, "", "blocksparse"], [82, 11, 1, "", "causal"], [82, 11, 1, "", "custom_mask"], [82, 11, 1, "", "padding"], [82, 11, 1, "", 
"sliding_window_causal"]], "tensorrt_llm.functional.Conditional": [[82, 12, 1, "", "add_input"], [82, 12, 1, "", "add_output"]], "tensorrt_llm.functional.LayerNormPositionType": [[82, 11, 1, "", "post_layernorm"], [82, 11, 1, "", "pre_layernorm"]], "tensorrt_llm.functional.LayerNormType": [[82, 11, 1, "", "GroupNorm"], [82, 11, 1, "", "LayerNorm"], [82, 11, 1, "", "RmsNorm"]], "tensorrt_llm.functional.MLPType": [[82, 11, 1, "", "FusedGatedMLP"], [82, 11, 1, "", "GatedMLP"], [82, 11, 1, "", "MLP"]], "tensorrt_llm.functional.PositionEmbeddingType": [[82, 11, 1, "", "alibi"], [82, 11, 1, "", "alibi_with_scale"], [82, 11, 1, "", "chatglm"], [82, 12, 1, "", "choices"], [82, 11, 1, "", "deferred"], [82, 12, 1, "", "from_string"], [82, 12, 1, "", "is_alibi"], [82, 12, 1, "", "is_deferred"], [82, 12, 1, "", "is_mrope"], [82, 12, 1, "", "is_rope"], [82, 11, 1, "", "learned_absolute"], [82, 11, 1, "", "long_rope"], [82, 11, 1, "", "mrope"], [82, 11, 1, "", "relative"], [82, 11, 1, "", "rope_gpt_neox"], [82, 11, 1, "", "rope_gptj"], [82, 11, 1, "", "yarn"]], "tensorrt_llm.functional.RopeEmbeddingUtils": [[82, 12, 1, "", "apply_llama3_scaling"], [82, 12, 1, "", "apply_rotary_pos_emb"], [82, 12, 1, "", "apply_rotary_pos_emb_chatglm"], [82, 12, 1, "", "apply_rotary_pos_emb_cogvlm"], [82, 12, 1, "", "create_fake_weight"], [82, 12, 1, "", "create_sinusoidal_positions"], [82, 12, 1, "", "create_sinusoidal_positions_for_attention_plugin"], [82, 12, 1, "", "create_sinusoidal_positions_for_cogvlm_attention_plugin"], [82, 12, 1, "", "create_sinusoidal_positions_long_rope"], [82, 12, 1, "", "create_sinusoidal_positions_yarn"], [82, 12, 1, "", "rotate_every_two"], [82, 12, 1, "", "rotate_half"]], "tensorrt_llm.functional.RotaryScalingType": [[82, 11, 1, "", "dynamic"], [82, 12, 1, "", "from_string"], [82, 11, 1, "", "linear"], [82, 11, 1, "", "llama3"], [82, 11, 1, "", "longrope"], [82, 11, 1, "", "mrope"], [82, 11, 1, "", "none"], [82, 11, 1, "", "yarn"]], 
"tensorrt_llm.functional.SideStreamIDType": [[82, 11, 1, "", "disable"], [82, 11, 1, "", "moe"]], "tensorrt_llm.functional.SliceInputType": [[82, 11, 1, "", "axes"], [82, 11, 1, "", "data"], [82, 11, 1, "", "fill_value"], [82, 11, 1, "", "size"], [82, 11, 1, "", "start"], [82, 11, 1, "", "stride"]], "tensorrt_llm.functional.Tensor": [[82, 12, 1, "", "abs"], [82, 12, 1, "", "cast"], [82, 13, 1, "", "dtype"], [82, 12, 1, "", "flatten"], [82, 12, 1, "", "get_parent"], [82, 12, 1, "", "get_users"], [82, 12, 1, "", "is_dynamic"], [82, 12, 1, "", "is_trt_wrapper"], [82, 13, 1, "", "location"], [82, 12, 1, "", "log"], [82, 12, 1, "", "mark_output"], [82, 12, 1, "", "max"], [82, 12, 1, "", "mean"], [82, 13, 1, "", "name"], [82, 12, 1, "", "ndim"], [82, 13, 1, "", "network"], [82, 12, 1, "", "permute"], [82, 12, 1, "", "rank"], [82, 12, 1, "", "repeat"], [82, 12, 1, "", "replace_all_uses_with"], [82, 12, 1, "", "select"], [82, 13, 1, "", "shape"], [82, 12, 1, "", "size"], [82, 12, 1, "", "split"], [82, 12, 1, "", "sqrt"], [82, 12, 1, "", "squeeze"], [82, 12, 1, "", "transpose"], [82, 12, 1, "", "unbind"], [82, 12, 1, "", "unsqueeze"], [82, 12, 1, "", "view"]], "tensorrt_llm.layers": [[83, 9, 0, "-", "activation"], [83, 9, 0, "-", "attention"], [83, 9, 0, "-", "cast"], [83, 9, 0, "-", "conv"], [83, 9, 0, "-", "embedding"], [83, 9, 0, "-", "linear"], [83, 9, 0, "-", "mlp"], [83, 9, 0, "-", "normalization"], [83, 9, 0, "-", "pooling"]], "tensorrt_llm.layers.activation": [[83, 10, 1, "", "Mish"]], "tensorrt_llm.layers.activation.Mish": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.attention": [[83, 10, 1, "", "Attention"], [83, 10, 1, "", "AttentionMaskParams"], [83, 10, 1, "", "AttentionParams"], [83, 10, 1, "", "BertAttention"], [83, 10, 1, "", "BlockSparseAttnParams"], [83, 10, 1, "", "CogVLMAttention"], [83, 10, 1, "", "DeepseekV2Attention"], [83, 10, 1, "", "DiffusersAttention"], [83, 10, 1, "", "KeyValueCacheParams"], [83, 10, 1, "", "MropeParams"], [83, 10, 1, "", 
"SpecDecodingParams"], [83, 14, 1, "", "compute_relative_bias"], [83, 14, 1, "", "make_causal_mask"]], "tensorrt_llm.layers.attention.Attention": [[83, 12, 1, "", "create_attention_const_params"], [83, 12, 1, "", "fill_attention_params"], [83, 12, 1, "", "forward"], [83, 12, 1, "", "postprocess"], [83, 12, 1, "", "set_rel_attn_table"]], "tensorrt_llm.layers.attention.AttentionParams": [[83, 12, 1, "", "fill_attention_const_params_for_long_rope"], [83, 12, 1, "", "fill_attention_const_params_for_rope"], [83, 12, 1, "", "is_valid"], [83, 12, 1, "", "is_valid_cross_attn"]], "tensorrt_llm.layers.attention.BertAttention": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.attention.CogVLMAttention": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.attention.DeepseekV2Attention": [[83, 12, 1, "", "forward"], [83, 12, 1, "", "postprocess"], [83, 12, 1, "", "weight_loader"]], "tensorrt_llm.layers.attention.DiffusersAttention": [[83, 12, 1, "", "forward"], [83, 12, 1, "", "joint_attn_forward"]], "tensorrt_llm.layers.attention.KeyValueCacheParams": [[83, 12, 1, "", "fill_none_tensor_list"], [83, 12, 1, "", "get_first_past_key_value"], [83, 12, 1, "", "is_valid"]], "tensorrt_llm.layers.cast": [[83, 10, 1, "", "Cast"]], "tensorrt_llm.layers.cast.Cast": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.conv": [[83, 10, 1, "", "Conv1d"], [83, 10, 1, "", "Conv2d"], [83, 10, 1, "", "Conv3d"], [83, 10, 1, "", "ConvTranspose2d"]], "tensorrt_llm.layers.conv.Conv1d": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.conv.Conv2d": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.conv.Conv3d": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.conv.ConvTranspose2d": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.embedding": [[83, 10, 1, "", "CombinedTimestepLabelEmbeddings"], [83, 10, 1, "", "CombinedTimestepTextProjEmbeddings"], [83, 10, 1, "", "Embedding"], [83, 10, 1, "", "LabelEmbedding"], [83, 10, 1, "", "PixArtAlphaTextProjection"], [83, 10, 1, "", 
"PromptTuningEmbedding"], [83, 10, 1, "", "SD3PatchEmbed"], [83, 10, 1, "", "TimestepEmbedding"], [83, 10, 1, "", "Timesteps"], [83, 14, 1, "", "get_1d_sincos_pos_embed_from_grid"], [83, 14, 1, "", "get_2d_sincos_pos_embed"], [83, 14, 1, "", "get_2d_sincos_pos_embed_from_grid"], [83, 14, 1, "", "get_timestep_embedding"]], "tensorrt_llm.layers.embedding.CombinedTimestepLabelEmbeddings": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.embedding.CombinedTimestepTextProjEmbeddings": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.embedding.Embedding": [[83, 12, 1, "", "forward"], [83, 12, 1, "", "postprocess"], [83, 12, 1, "", "weight_loader"]], "tensorrt_llm.layers.embedding.LabelEmbedding": [[83, 12, 1, "", "forward"], [83, 12, 1, "", "token_drop"]], "tensorrt_llm.layers.embedding.PixArtAlphaTextProjection": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.embedding.PromptTuningEmbedding": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.embedding.SD3PatchEmbed": [[83, 12, 1, "", "cropped_pos_embed"], [83, 12, 1, "", "forward"]], "tensorrt_llm.layers.embedding.TimestepEmbedding": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.embedding.Timesteps": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.linear": [[83, 11, 1, "", "ColumnLinear"], [83, 10, 1, "", "Linear"], [83, 10, 1, "", "LinearBase"], [83, 10, 1, "", "RowLinear"]], "tensorrt_llm.layers.linear.Linear": [[83, 12, 1, "", "collect_and_bias"], [83, 12, 1, "", "postprocess"], [83, 12, 1, "", "tp_split_dim"]], "tensorrt_llm.layers.linear.LinearBase": [[83, 12, 1, "", "collect_and_bias"], [83, 12, 1, "", "forward"], [83, 12, 1, "", "get_weight"], [83, 12, 1, "", "multiply_and_lora"], [83, 12, 1, "", "multiply_collect"], [83, 12, 1, "", "tp_split_dim"], [83, 12, 1, "", "weight_loader"]], "tensorrt_llm.layers.linear.RowLinear": [[83, 12, 1, "", "collect_and_bias"], [83, 12, 1, "", "multiply_collect"], [83, 12, 1, "", "tp_split_dim"]], "tensorrt_llm.layers.mlp": [[83, 10, 1, "", "FusedGatedMLP"], [83, 
10, 1, "", "GatedMLP"], [83, 10, 1, "", "LinearActivation"], [83, 10, 1, "", "LinearApproximateGELU"], [83, 10, 1, "", "LinearGEGLU"], [83, 10, 1, "", "LinearGELU"], [83, 10, 1, "", "LinearSwiGLU"], [83, 10, 1, "", "MLP"], [83, 14, 1, "", "fc_gate_dora"], [83, 14, 1, "", "fc_gate_lora"]], "tensorrt_llm.layers.mlp.FusedGatedMLP": [[83, 12, 1, "", "fc_gate"], [83, 12, 1, "", "fc_gate_plugin"], [83, 12, 1, "", "forward"]], "tensorrt_llm.layers.mlp.GatedMLP": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.mlp.LinearActivation": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.mlp.LinearApproximateGELU": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.mlp.LinearGEGLU": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.mlp.LinearGELU": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.mlp.LinearSwiGLU": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.mlp.MLP": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.normalization": [[83, 10, 1, "", "AdaLayerNorm"], [83, 10, 1, "", "AdaLayerNormContinuous"], [83, 10, 1, "", "AdaLayerNormZero"], [83, 10, 1, "", "AdaLayerNormZeroSingle"], [83, 10, 1, "", "GroupNorm"], [83, 10, 1, "", "LayerNorm"], [83, 10, 1, "", "RmsNorm"], [83, 10, 1, "", "SD35AdaLayerNormZeroX"]], "tensorrt_llm.layers.normalization.AdaLayerNorm": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.normalization.AdaLayerNormContinuous": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.normalization.AdaLayerNormZero": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.normalization.AdaLayerNormZeroSingle": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.normalization.GroupNorm": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.normalization.LayerNorm": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.normalization.RmsNorm": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.normalization.SD35AdaLayerNormZeroX": [[83, 12, 1, "", "forward"]], "tensorrt_llm.layers.pooling": [[83, 10, 1, "", "AvgPool2d"]], 
"tensorrt_llm.layers.pooling.AvgPool2d": [[83, 12, 1, "", "forward"]], "tensorrt_llm.llmapi": [[70, 10, 1, "", "BatchingType"], [70, 10, 1, "", "BuildCacheConfig"], [70, 10, 1, "", "BuildConfig"], [70, 10, 1, "", "CacheTransceiverConfig"], [70, 10, 1, "", "CalibConfig"], [70, 10, 1, "", "CapacitySchedulerPolicy"], [70, 10, 1, "", "CompletionOutput"], [70, 10, 1, "", "ContextChunkingPolicy"], [70, 10, 1, "", "DisaggregatedParams"], [70, 10, 1, "", "DynamicBatchConfig"], [70, 10, 1, "", "EagleDecodingConfig"], [70, 10, 1, "", "ExtendedRuntimePerfKnobConfig"], [70, 10, 1, "", "GuidedDecodingParams"], [70, 10, 1, "", "KvCacheConfig"], [70, 10, 1, "", "KvCacheRetentionConfig"], [70, 10, 1, "", "LLM"], [70, 11, 1, "", "LlmArgs"], [70, 10, 1, "", "LookaheadDecodingConfig"], [70, 10, 1, "", "MTPDecodingConfig"], [70, 10, 1, "", "MedusaDecodingConfig"], [70, 10, 1, "", "MpiCommSession"], [70, 10, 1, "", "NGramDecodingConfig"], [70, 10, 1, "", "QuantAlgo"], [70, 10, 1, "", "QuantConfig"], [70, 10, 1, "", "RequestError"], [70, 10, 1, "", "RequestOutput"], [70, 10, 1, "", "SamplingParams"], [70, 10, 1, "", "SchedulerConfig"], [70, 10, 1, "", "TorchLlmArgs"], [70, 10, 1, "", "TrtLlmArgs"]], "tensorrt_llm.llmapi.BatchingType": [[70, 11, 1, "", "INFLIGHT"], [70, 11, 1, "", "STATIC"]], "tensorrt_llm.llmapi.BuildCacheConfig": [[70, 12, 1, "", "__init__"], [70, 13, 1, "id7", "cache_root"], [70, 13, 1, "id8", "max_cache_storage_gb"], [70, 13, 1, "id9", "max_records"]], "tensorrt_llm.llmapi.BuildConfig": [[70, 12, 1, "", "__init__"], [70, 11, 1, "", "auto_parallel_config"], [70, 11, 1, "", "dry_run"], [70, 11, 1, "", "enable_debug_output"], [70, 11, 1, "", "force_num_profiles"], [70, 12, 1, "", "from_dict"], [70, 12, 1, "", "from_json_file"], [70, 11, 1, "", "gather_context_logits"], [70, 11, 1, "", "gather_generation_logits"], [70, 11, 1, "", "input_timing_cache"], [70, 11, 1, "", "kv_cache_type"], [70, 11, 1, "", "lora_config"], [70, 11, 1, "", "max_batch_size"], [70, 11, 1, "", 
"max_beam_width"], [70, 11, 1, "", "max_draft_len"], [70, 11, 1, "", "max_encoder_input_len"], [70, 11, 1, "", "max_input_len"], [70, 11, 1, "", "max_num_tokens"], [70, 11, 1, "", "max_prompt_embedding_table_size"], [70, 11, 1, "", "max_seq_len"], [70, 11, 1, "", "monitor_memory"], [70, 11, 1, "", "opt_batch_size"], [70, 11, 1, "", "opt_num_tokens"], [70, 11, 1, "", "output_timing_cache"], [70, 11, 1, "", "plugin_config"], [70, 11, 1, "", "profiling_verbosity"], [70, 11, 1, "", "speculative_decoding_mode"], [70, 11, 1, "", "strongly_typed"], [70, 12, 1, "", "to_dict"], [70, 12, 1, "", "update"], [70, 12, 1, "", "update_from_dict"], [70, 12, 1, "", "update_kv_cache_type"], [70, 11, 1, "", "use_mrope"], [70, 11, 1, "", "use_refit"], [70, 11, 1, "", "use_strip_plan"], [70, 11, 1, "", "visualize_network"], [70, 11, 1, "", "weight_sparsity"], [70, 11, 1, "", "weight_streaming"]], "tensorrt_llm.llmapi.CacheTransceiverConfig": [[70, 15, 1, "", "max_num_tokens"], [70, 11, 1, "", "model_config"]], "tensorrt_llm.llmapi.CalibConfig": [[70, 15, 1, "", "calib_batch_size"], [70, 15, 1, "", "calib_batches"], [70, 15, 1, "", "calib_dataset"], [70, 15, 1, "", "calib_max_seq_length"], [70, 15, 1, "", "device"], [70, 12, 1, "", "from_dict"], [70, 11, 1, "", "model_config"], [70, 15, 1, "", "random_seed"], [70, 12, 1, "", "to_dict"], [70, 15, 1, "", "tokenizer_max_seq_length"]], "tensorrt_llm.llmapi.CapacitySchedulerPolicy": [[70, 11, 1, "", "GUARANTEED_NO_EVICT"], [70, 11, 1, "", "MAX_UTILIZATION"], [70, 11, 1, "", "STATIC_BATCH"]], "tensorrt_llm.llmapi.CompletionOutput": [[70, 12, 1, "", "__init__"], [70, 11, 1, "", "cumulative_logprob"], [70, 11, 1, "", "disaggregated_params"], [70, 11, 1, "", "finish_reason"], [70, 11, 1, "", "generation_logits"], [70, 11, 1, "", "index"], [70, 13, 1, "id2", "length"], [70, 11, 1, "", "logprobs"], [70, 13, 1, "id3", "logprobs_diff"], [70, 11, 1, "", "prompt_logprobs"], [70, 11, 1, "", "stop_reason"], [70, 11, 1, "", "text"], [70, 13, 1, "id4", 
"text_diff"], [70, 11, 1, "", "token_ids"], [70, 13, 1, "id5", "token_ids_diff"]], "tensorrt_llm.llmapi.ContextChunkingPolicy": [[70, 11, 1, "", "EQUAL_PROGRESS"], [70, 11, 1, "", "FIRST_COME_FIRST_SERVED"]], "tensorrt_llm.llmapi.DisaggregatedParams": [[70, 12, 1, "", "__init__"], [70, 11, 1, "", "ctx_request_id"], [70, 11, 1, "", "draft_tokens"], [70, 11, 1, "", "first_gen_tokens"], [70, 12, 1, "", "get_context_phase_params"], [70, 12, 1, "", "get_request_type"], [70, 11, 1, "", "opaque_state"], [70, 11, 1, "", "request_type"]], "tensorrt_llm.llmapi.DynamicBatchConfig": [[70, 15, 1, "", "dynamic_batch_moving_average_window"], [70, 15, 1, "", "enable_batch_size_tuning"], [70, 15, 1, "", "enable_max_num_tokens_tuning"], [70, 11, 1, "", "model_config"]], "tensorrt_llm.llmapi.EagleDecodingConfig": [[70, 11, 1, "", "decoding_type"], [70, 15, 1, "", "dynamic_tree_max_topK"], [70, 15, 1, "", "eagle3_one_model"], [70, 15, 1, "", "eagle_choices"], [70, 12, 1, "", "from_dict"], [70, 15, 1, "", "greedy_sampling"], [70, 15, 1, "", "max_non_leaves_per_layer"], [70, 11, 1, "", "model_config"], [70, 15, 1, "", "num_eagle_layers"], [70, 15, 1, "", "posterior_threshold"], [70, 15, 1, "", "pytorch_eagle_weights_path"], [70, 15, 1, "", "use_dynamic_tree"]], "tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig": [[70, 15, 1, "", "cuda_graph_cache_size"], [70, 15, 1, "", "cuda_graph_mode"], [70, 15, 1, "", "enable_context_fmha_fp32_acc"], [70, 11, 1, "", "model_config"], [70, 15, 1, "", "multi_block_mode"]], "tensorrt_llm.llmapi.GuidedDecodingParams": [[70, 12, 1, "", "__init__"], [70, 11, 1, "", "grammar"], [70, 11, 1, "", "json"], [70, 11, 1, "", "json_object"], [70, 11, 1, "", "regex"], [70, 11, 1, "", "structural_tag"]], "tensorrt_llm.llmapi.KvCacheConfig": [[70, 15, 1, "", "copy_on_partial_reuse"], [70, 15, 1, "", "cross_kv_cache_fraction"], [70, 15, 1, "", "enable_block_reuse"], [70, 15, 1, "", "enable_partial_reuse"], [70, 15, 1, "", "event_buffer_max_size"], [70, 15, 1, "", 
"free_gpu_memory_fraction"], [70, 15, 1, "", "host_cache_size"], [70, 15, 1, "", "max_attention_window"], [70, 15, 1, "", "max_tokens"], [70, 11, 1, "", "model_config"], [70, 15, 1, "", "onboard_blocks"], [70, 15, 1, "", "secondary_offload_min_priority"], [70, 15, 1, "", "sink_token_length"]], "tensorrt_llm.llmapi.KvCacheRetentionConfig": [[70, 10, 1, "", "TokenRangeRetentionConfig"], [70, 12, 1, "", "__init__"], [70, 13, 1, "", "decode_duration_ms"], [70, 13, 1, "", "decode_retention_priority"], [70, 13, 1, "", "directory"], [70, 13, 1, "", "token_range_retention_configs"], [70, 13, 1, "", "transfer_mode"]], "tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig": [[70, 12, 1, "", "__init__"], [70, 13, 1, "", "duration_ms"], [70, 13, 1, "", "priority"], [70, 13, 1, "", "token_end"], [70, 13, 1, "", "token_start"]], "tensorrt_llm.llmapi.LLM": [[70, 12, 1, "", "__init__"], [70, 12, 1, "", "generate"], [70, 12, 1, "", "generate_async"], [70, 12, 1, "", "get_kv_cache_events"], [70, 12, 1, "", "get_kv_cache_events_async"], [70, 12, 1, "", "get_stats"], [70, 12, 1, "", "get_stats_async"], [70, 12, 1, "", "save"], [70, 12, 1, "", "shutdown"], [70, 13, 1, "id0", "tokenizer"], [70, 13, 1, "id1", "workspace"]], "tensorrt_llm.llmapi.LookaheadDecodingConfig": [[70, 12, 1, "", "__init__"], [70, 12, 1, "", "calculate_speculative_resource"], [70, 11, 1, "", "decoding_type"], [70, 12, 1, "", "from_dict"], [70, 15, 1, "", "max_ngram_size"], [70, 15, 1, "", "max_verification_set_size"], [70, 15, 1, "", "max_window_size"], [70, 11, 1, "", "model_config"], [70, 16, 1, "", "validate_positive_values"]], "tensorrt_llm.llmapi.MTPDecodingConfig": [[70, 11, 1, "", "decoding_type"], [70, 12, 1, "", "from_dict"], [70, 11, 1, "", "model_config"], [70, 15, 1, "", "num_nextn_predict_layers"], [70, 15, 1, "", "relaxed_delta"], [70, 15, 1, "", "relaxed_topk"], [70, 15, 1, "", "use_relaxed_acceptance_for_thinking"]], "tensorrt_llm.llmapi.MedusaDecodingConfig": [[70, 11, 1, "", 
"decoding_type"], [70, 12, 1, "", "from_dict"], [70, 15, 1, "", "medusa_choices"], [70, 11, 1, "", "model_config"], [70, 15, 1, "", "num_medusa_heads"]], "tensorrt_llm.llmapi.MpiCommSession": [[70, 12, 1, "", "__init__"], [70, 12, 1, "", "abort"], [70, 12, 1, "", "get_comm"], [70, 12, 1, "", "shutdown"], [70, 12, 1, "", "submit"], [70, 12, 1, "", "submit_sync"]], "tensorrt_llm.llmapi.NGramDecodingConfig": [[70, 11, 1, "", "decoding_type"], [70, 12, 1, "", "from_dict"], [70, 15, 1, "", "is_keep_all"], [70, 15, 1, "", "is_public_pool"], [70, 15, 1, "", "is_use_oldest"], [70, 15, 1, "", "max_matching_ngram_size"], [70, 11, 1, "", "model_config"], [70, 15, 1, "", "prompt_lookup_num_tokens"]], "tensorrt_llm.llmapi.QuantAlgo": [[70, 11, 1, "", "FP8"], [70, 11, 1, "", "FP8_BLOCK_SCALES"], [70, 11, 1, "", "FP8_PER_CHANNEL_PER_TOKEN"], [70, 11, 1, "", "INT8"], [70, 11, 1, "", "MIXED_PRECISION"], [70, 11, 1, "", "NO_QUANT"], [70, 11, 1, "", "NVFP4"], [70, 11, 1, "", "W4A16"], [70, 11, 1, "", "W4A16_AWQ"], [70, 11, 1, "", "W4A16_GPTQ"], [70, 11, 1, "", "W4A8_AWQ"], [70, 11, 1, "", "W4A8_QSERVE_PER_CHANNEL"], [70, 11, 1, "", "W4A8_QSERVE_PER_GROUP"], [70, 11, 1, "", "W8A16"], [70, 11, 1, "", "W8A16_GPTQ"], [70, 11, 1, "", "W8A8_SQ_PER_CHANNEL"], [70, 11, 1, "", "W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN"], [70, 11, 1, "", "W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN"], [70, 11, 1, "", "W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN"], [70, 11, 1, "", "W8A8_SQ_PER_TENSOR_PLUGIN"]], "tensorrt_llm.llmapi.QuantConfig": [[70, 12, 1, "", "__init__"], [70, 11, 1, "", "clamp_val"], [70, 11, 1, "", "exclude_modules"], [70, 12, 1, "", "from_dict"], [70, 11, 1, "", "group_size"], [70, 11, 1, "", "has_zero_point"], [70, 12, 1, "", "is_module_excluded_from_quantization"], [70, 11, 1, "", "kv_cache_quant_algo"], [70, 13, 1, "", "layer_quant_mode"], [70, 11, 1, "", "pre_quant_scale"], [70, 11, 1, "", "quant_algo"], [70, 13, 1, "", "quant_mode"], [70, 11, 1, "", "smoothquant_val"], [70, 12, 1, "", "to_dict"], [70, 
11, 1, "", "use_meta_recipe"]], "tensorrt_llm.llmapi.RequestOutput": [[70, 12, 1, "", "__init__"], [70, 11, 1, "", "context_logits"], [70, 11, 1, "", "finished"], [70, 11, 1, "", "outputs"], [70, 13, 1, "id6", "prompt"], [70, 11, 1, "", "prompt_token_ids"], [70, 11, 1, "", "request_id"]], "tensorrt_llm.llmapi.SamplingParams": [[70, 12, 1, "", "__init__"], [70, 11, 1, "", "add_special_tokens"], [70, 11, 1, "", "additional_model_outputs"], [70, 11, 1, "", "apply_batched_logits_processor"], [70, 11, 1, "", "bad"], [70, 11, 1, "", "bad_token_ids"], [70, 11, 1, "", "beam_search_diversity_rate"], [70, 11, 1, "", "beam_width_array"], [70, 11, 1, "", "best_of"], [70, 11, 1, "", "detokenize"], [70, 11, 1, "", "early_stopping"], [70, 11, 1, "", "embedding_bias"], [70, 11, 1, "", "end_id"], [70, 11, 1, "", "exclude_input_from_output"], [70, 11, 1, "", "frequency_penalty"], [70, 11, 1, "", "guided_decoding"], [70, 11, 1, "", "ignore_eos"], [70, 11, 1, "", "include_stop_str_in_output"], [70, 11, 1, "", "length_penalty"], [70, 11, 1, "", "logits_processor"], [70, 11, 1, "", "logprobs"], [70, 11, 1, "", "lookahead_config"], [70, 11, 1, "", "max_tokens"], [70, 11, 1, "", "min_p"], [70, 11, 1, "", "min_tokens"], [70, 11, 1, "", "n"], [70, 11, 1, "", "no_repeat_ngram_size"], [70, 11, 1, "", "pad_id"], [70, 11, 1, "", "presence_penalty"], [70, 11, 1, "", "prompt_logprobs"], [70, 11, 1, "", "repetition_penalty"], [70, 11, 1, "", "return_context_logits"], [70, 11, 1, "", "return_encoder_output"], [70, 11, 1, "", "return_generation_logits"], [70, 11, 1, "", "return_perf_metrics"], [70, 11, 1, "", "seed"], [70, 11, 1, "", "skip_special_tokens"], [70, 11, 1, "", "spaces_between_special_tokens"], [70, 11, 1, "", "stop"], [70, 11, 1, "", "stop_token_ids"], [70, 11, 1, "", "temperature"], [70, 11, 1, "", "top_k"], [70, 11, 1, "", "top_p"], [70, 11, 1, "", "top_p_decay"], [70, 11, 1, "", "top_p_min"], [70, 11, 1, "", "top_p_reset_ids"], [70, 11, 1, "", "truncate_prompt_tokens"], [70, 11, 1, 
"", "use_beam_search"]], "tensorrt_llm.llmapi.SchedulerConfig": [[70, 15, 1, "", "capacity_scheduler_policy"], [70, 15, 1, "", "context_chunking_policy"], [70, 15, 1, "", "dynamic_batch_config"], [70, 11, 1, "", "model_config"]], "tensorrt_llm.llmapi.TorchLlmArgs": [[70, 15, 1, "", "attn_backend"], [70, 15, 1, "", "auto_deploy_config"], [70, 15, 1, "", "autotuner_enabled"], [70, 15, 1, "", "build_config"], [70, 16, 1, "", "convert_load_format"], [70, 15, 1, "", "cuda_graph_batch_sizes"], [70, 15, 1, "", "cuda_graph_max_batch_size"], [70, 15, 1, "", "cuda_graph_padding_enabled"], [70, 11, 1, "", "decoding_config"], [70, 15, 1, "", "disable_overlap_scheduler"], [70, 15, 1, "", "enable_iter_perf_stats"], [70, 15, 1, "", "enable_iter_req_stats"], [70, 15, 1, "", "enable_layerwise_nvtx_marker"], [70, 15, 1, "", "enable_min_latency"], [70, 15, 1, "", "enable_trtllm_sampler"], [70, 13, 1, "", "extra_resource_managers"], [70, 11, 1, "id18", "field_name"], [70, 12, 1, "", "get_pytorch_backend_config"], [70, 15, 1, "", "kv_cache_dtype"], [70, 15, 1, "", "load_format"], [70, 11, 1, "", "max_cpu_loras"], [70, 11, 1, "", "max_lora_rank"], [70, 11, 1, "", "max_loras"], [70, 15, 1, "", "mixed_sampler"], [70, 11, 1, "", "model_config"], [70, 12, 1, "", "model_post_init"], [70, 15, 1, "", "moe_backend"], [70, 15, 1, "", "moe_load_balancer"], [70, 15, 1, "", "moe_max_num_tokens"], [70, 11, 1, "id16", "msg"], [70, 15, 1, "", "print_iter_log"], [70, 15, 1, "", "torch_compile_enable_userbuffers"], [70, 15, 1, "", "torch_compile_enabled"], [70, 15, 1, "", "torch_compile_fullgraph"], [70, 15, 1, "", "torch_compile_inductor_enabled"], [70, 15, 1, "", "torch_compile_piecewise_cuda_graph"], [70, 15, 1, "", "use_cuda_graph"], [70, 15, 1, "", "use_kv_cache"], [70, 16, 1, "", "validate_cuda_graph_config"], [70, 16, 1, "", "validate_cuda_graph_max_batch_size"], [70, 11, 1, "id17", "wrapped_property"]], "tensorrt_llm.llmapi.TrtLlmArgs": [[70, 11, 1, "", "auto_parallel"], [70, 13, 1, "", 
"auto_parallel_config"], [70, 11, 1, "", "auto_parallel_world_size"], [70, 15, 1, "", "build_config"], [70, 15, 1, "", "calib_config"], [70, 11, 1, "", "decoding_config"], [70, 15, 1, "", "embedding_parallel_mode"], [70, 15, 1, "", "enable_build_cache"], [70, 15, 1, "", "enable_tqdm"], [70, 15, 1, "", "extended_runtime_perf_knob_config"], [70, 15, 1, "", "fast_build"], [70, 11, 1, "id33", "field_name"], [70, 11, 1, "", "max_cpu_loras"], [70, 11, 1, "", "max_lora_rank"], [70, 11, 1, "", "max_loras"], [70, 11, 1, "", "model_config"], [70, 12, 1, "", "model_post_init"], [70, 11, 1, "id31", "msg"], [70, 15, 1, "", "workspace"], [70, 11, 1, "id32", "wrapped_property"]], "tensorrt_llm.models": [[84, 10, 1, "", "BaichuanForCausalLM"], [84, 10, 1, "", "BertForQuestionAnswering"], [84, 10, 1, "", "BertForSequenceClassification"], [84, 10, 1, "", "BertModel"], [84, 10, 1, "", "BloomForCausalLM"], [84, 10, 1, "", "BloomModel"], [84, 10, 1, "", "CLIPVisionTransformer"], [84, 10, 1, "", "ChatGLMConfig"], [84, 10, 1, "", "ChatGLMForCausalLM"], [84, 10, 1, "", "ChatGLMModel"], [84, 10, 1, "", "CogVLMConfig"], [84, 10, 1, "", "CogVLMForCausalLM"], [84, 10, 1, "", "CohereForCausalLM"], [84, 10, 1, "", "DbrxConfig"], [84, 10, 1, "", "DbrxForCausalLM"], [84, 10, 1, "", "DecoderModel"], [84, 10, 1, "", "DeepseekForCausalLM"], [84, 10, 1, "", "DeepseekV2ForCausalLM"], [84, 10, 1, "", "DiT"], [84, 10, 1, "", "EagleForCausalLM"], [84, 10, 1, "", "EncoderModel"], [84, 10, 1, "", "FalconConfig"], [84, 10, 1, "", "FalconForCausalLM"], [84, 10, 1, "", "FalconModel"], [84, 10, 1, "", "GPTConfig"], [84, 10, 1, "", "GPTForCausalLM"], [84, 10, 1, "", "GPTJConfig"], [84, 10, 1, "", "GPTJForCausalLM"], [84, 10, 1, "", "GPTJModel"], [84, 10, 1, "", "GPTModel"], [84, 10, 1, "", "GPTNeoXForCausalLM"], [84, 10, 1, "", "GPTNeoXModel"], [84, 10, 1, "", "GemmaConfig"], [84, 10, 1, "", "GemmaForCausalLM"], [84, 10, 1, "", "LLaMAConfig"], [84, 10, 1, "", "LLaMAForCausalLM"], [84, 10, 1, "", "LLaMAModel"], 
[84, 10, 1, "", "LlavaNextVisionConfig"], [84, 10, 1, "", "LlavaNextVisionWrapper"], [84, 10, 1, "", "MLLaMAForCausalLM"], [84, 10, 1, "", "MPTForCausalLM"], [84, 10, 1, "", "MPTModel"], [84, 10, 1, "", "MambaForCausalLM"], [84, 10, 1, "", "MedusaConfig"], [84, 10, 1, "", "MedusaForCausalLm"], [84, 10, 1, "", "OPTForCausalLM"], [84, 10, 1, "", "OPTModel"], [84, 10, 1, "", "Phi3ForCausalLM"], [84, 10, 1, "", "Phi3Model"], [84, 10, 1, "", "PhiForCausalLM"], [84, 10, 1, "", "PhiModel"], [84, 10, 1, "", "PretrainedConfig"], [84, 10, 1, "", "PretrainedModel"], [84, 10, 1, "", "ReDrafterForCausalLM"], [84, 10, 1, "", "RecurrentGemmaForCausalLM"], [84, 11, 1, "", "RobertaForQuestionAnswering"], [84, 11, 1, "", "RobertaForSequenceClassification"], [84, 11, 1, "", "RobertaModel"], [84, 10, 1, "", "SD3Transformer2DModel"], [84, 10, 1, "", "SpeculativeDecodingMode"], [84, 10, 1, "", "WhisperEncoder"]], "tensorrt_llm.models.BaichuanForCausalLM": [[84, 11, 1, "", "config_class"], [84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", "quantize"]], "tensorrt_llm.models.BertForQuestionAnswering": [[84, 12, 1, "", "forward"]], "tensorrt_llm.models.BertForSequenceClassification": [[84, 12, 1, "", "forward"]], "tensorrt_llm.models.BertModel": [[84, 12, 1, "", "forward"]], "tensorrt_llm.models.BloomModel": [[84, 12, 1, "", "forward"]], "tensorrt_llm.models.CLIPVisionTransformer": [[84, 12, 1, "", "forward"]], "tensorrt_llm.models.ChatGLMConfig": [[84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", "to_dict"]], "tensorrt_llm.models.ChatGLMForCausalLM": [[84, 11, 1, "", "config_class"], [84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", "prepare_inputs"], [84, 12, 1, "", "quantize"]], "tensorrt_llm.models.ChatGLMModel": [[84, 12, 1, "", "forward"]], "tensorrt_llm.models.CogVLMConfig": [[84, 12, 1, "", "to_dict"]], "tensorrt_llm.models.CogVLMForCausalLM": [[84, 11, 1, "", "config_class"], [84, 12, 1, "", "default_plugin_config"], [84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", 
"quantize"]], "tensorrt_llm.models.CohereForCausalLM": [[84, 11, 1, "", "config_class"], [84, 12, 1, "", "from_hugging_face"]], "tensorrt_llm.models.DbrxConfig": [[84, 12, 1, "", "to_dict"]], "tensorrt_llm.models.DbrxForCausalLM": [[84, 11, 1, "", "config_class"]], "tensorrt_llm.models.DecoderModel": [[84, 12, 1, "", "check_config"], [84, 12, 1, "", "forward"], [84, 12, 1, "", "precompute_relative_attention_bias"], [84, 12, 1, "", "prepare_inputs"], [84, 12, 1, "", "use_lora"]], "tensorrt_llm.models.DeepseekForCausalLM": [[84, 11, 1, "", "config_class"], [84, 12, 1, "", "from_hugging_face"]], "tensorrt_llm.models.DeepseekV2ForCausalLM": [[84, 11, 1, "", "config_class"], [84, 12, 1, "", "from_hugging_face"]], "tensorrt_llm.models.DiT": [[84, 12, 1, "", "check_config"], [84, 12, 1, "", "forward"], [84, 12, 1, "", "forward_with_cfg"], [84, 12, 1, "", "forward_without_cfg"], [84, 12, 1, "", "prepare_inputs"], [84, 12, 1, "", "unpatchify"]], "tensorrt_llm.models.EagleForCausalLM": [[84, 11, 1, "", "config_class"], [84, 12, 1, "", "forward"], [84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", "prepare_inputs"]], "tensorrt_llm.models.EncoderModel": [[84, 12, 1, "", "check_config"], [84, 12, 1, "", "forward"], [84, 12, 1, "", "precompute_relative_attention_bias"], [84, 12, 1, "", "prepare_inputs"], [84, 12, 1, "", "use_lora"], [84, 12, 1, "", "use_prompt_tuning"]], "tensorrt_llm.models.FalconConfig": [[84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", "to_dict"]], "tensorrt_llm.models.FalconForCausalLM": [[84, 12, 1, "", "check_config"], [84, 11, 1, "", "config_class"], [84, 12, 1, "", "from_hugging_face"]], "tensorrt_llm.models.FalconModel": [[84, 12, 1, "", "forward"]], "tensorrt_llm.models.GPTConfig": [[84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", "from_nemo"], [84, 12, 1, "", "to_dict"]], "tensorrt_llm.models.GPTForCausalLM": [[84, 11, 1, "", "config_class"], [84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", "from_nemo"], [84, 12, 1, "", "quantize"], 
[84, 12, 1, "", "use_lora"]], "tensorrt_llm.models.GPTJConfig": [[84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", "to_dict"]], "tensorrt_llm.models.GPTJForCausalLM": [[84, 11, 1, "", "config_class"], [84, 12, 1, "", "from_hugging_face"]], "tensorrt_llm.models.GPTJModel": [[84, 12, 1, "", "forward"]], "tensorrt_llm.models.GPTModel": [[84, 12, 1, "", "forward"]], "tensorrt_llm.models.GPTNeoXModel": [[84, 12, 1, "", "forward"]], "tensorrt_llm.models.GemmaConfig": [[84, 11, 1, "", "GEMMA2_ADDED_FIELDS"], [84, 11, 1, "", "GEMMA3_ADDED_FIELDS"], [84, 11, 1, "", "GEMMA_ADDED_FIELDS"], [84, 11, 1, "", "VERBATIM"], [84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", "gemma2_config"], [84, 12, 1, "", "gemma3_config"], [84, 12, 1, "", "get_hf_config"], [84, 13, 1, "", "is_gemma_2"], [84, 13, 1, "", "is_gemma_3"], [84, 12, 1, "", "to_dict"]], "tensorrt_llm.models.GemmaForCausalLM": [[84, 11, 1, "", "NATIVE_QUANT_FLOW"], [84, 12, 1, "", "assert_valid_quant_algo"], [84, 11, 1, "", "config_class"], [84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", "quantize"], [84, 12, 1, "", "use_lora"]], "tensorrt_llm.models.LLaMAConfig": [[84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", "from_meta_ckpt"], [84, 12, 1, "", "to_dict"]], "tensorrt_llm.models.LLaMAForCausalLM": [[84, 11, 1, "", "config_class"], [84, 12, 1, "", "default_plugin_config"], [84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", "from_meta_ckpt"], [84, 12, 1, "", "quantize"], [84, 12, 1, "", "use_lora"]], "tensorrt_llm.models.LLaMAModel": [[84, 12, 1, "", "forward"]], "tensorrt_llm.models.LlavaNextVisionConfig": [[84, 12, 1, "", "from_hugging_face"]], "tensorrt_llm.models.LlavaNextVisionWrapper": [[84, 12, 1, "", "forward"], [84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", "prepare_inputs"], [84, 12, 1, "", "save_checkpoint"]], "tensorrt_llm.models.MLLaMAForCausalLM": [[84, 11, 1, "", "config_class"], [84, 12, 1, "", "forward"], [84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", "prepare_inputs"], 
[84, 12, 1, "", "use_lora"]], "tensorrt_llm.models.MPTForCausalLM": [[84, 12, 1, "", "check_config"]], "tensorrt_llm.models.MPTModel": [[84, 12, 1, "", "forward"]], "tensorrt_llm.models.MambaForCausalLM": [[84, 11, 1, "", "config_class"], [84, 12, 1, "", "forward"], [84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", "prepare_inputs"]], "tensorrt_llm.models.MedusaConfig": [[84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", "to_dict"]], "tensorrt_llm.models.MedusaForCausalLm": [[84, 11, 1, "", "config_class"], [84, 12, 1, "", "from_hugging_face"]], "tensorrt_llm.models.OPTForCausalLM": [[84, 12, 1, "", "check_config"]], "tensorrt_llm.models.OPTModel": [[84, 12, 1, "", "forward"]], "tensorrt_llm.models.Phi3ForCausalLM": [[84, 11, 1, "", "config_class"], [84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", "use_lora"]], "tensorrt_llm.models.Phi3Model": [[84, 12, 1, "", "forward"]], "tensorrt_llm.models.PhiForCausalLM": [[84, 12, 1, "", "check_config"], [84, 11, 1, "", "config_class"], [84, 12, 1, "", "from_hugging_face"], [84, 12, 1, "", "use_lora"]], "tensorrt_llm.models.PhiModel": [[84, 12, 1, "", "forward"]], "tensorrt_llm.models.PretrainedConfig": [[84, 12, 1, "", "create_runtime_defaults"], [84, 12, 1, "", "for_each_rank"], [84, 12, 1, "", "from_checkpoint"], [84, 12, 1, "", "from_dict"], [84, 12, 1, "", "from_json_file"], [84, 12, 1, "", "get_config_group"], [84, 12, 1, "", "has_config_group"], [84, 13, 1, "", "kv_dtype"], [84, 13, 1, "", "quant_algo"], [84, 13, 1, "", "quant_mode"], [84, 12, 1, "", "set_if_not_exist"], [84, 12, 1, "", "set_rank"], [84, 12, 1, "", "to_dict"], [84, 12, 1, "", "to_json_file"], [84, 12, 1, "", "to_layer_quant_config"]], "tensorrt_llm.models.PretrainedModel": [[84, 12, 1, "", "check_config"], [84, 12, 1, "", "from_checkpoint"], [84, 12, 1, "", "from_config"], [84, 12, 1, "", "load"], [84, 12, 1, "", "prepare_inputs"], [84, 12, 1, "", "quantize"], [84, 12, 1, "", "release"], [84, 12, 1, "", "save_checkpoint"]], 
"tensorrt_llm.models.ReDrafterForCausalLM": [[84, 12, 1, "", "forward"], [84, 12, 1, "", "prepare_inputs"]], "tensorrt_llm.models.RecurrentGemmaForCausalLM": [[84, 12, 1, "", "forward"], [84, 12, 1, "", "prepare_inputs"], [84, 12, 1, "", "prepare_recurrent_inputs"]], "tensorrt_llm.models.SD3Transformer2DModel": [[84, 13, 1, "", "attn_processors"], [84, 11, 1, "", "config_class"], [84, 12, 1, "", "disable_forward_chunking"], [84, 12, 1, "", "enable_forward_chunking"], [84, 12, 1, "", "forward"], [84, 12, 1, "", "from_pretrained"], [84, 12, 1, "", "fuse_qkv_projections"], [84, 12, 1, "", "load"], [84, 12, 1, "", "prepare_inputs"], [84, 12, 1, "", "set_attn_processor"], [84, 12, 1, "", "unfuse_qkv_projections"]], "tensorrt_llm.models.SpeculativeDecodingMode": [[84, 11, 1, "", "DRAFT_TOKENS_EXTERNAL"], [84, 11, 1, "", "EAGLE"], [84, 11, 1, "", "EXPLICIT_DRAFT_TOKENS"], [84, 11, 1, "", "LOOKAHEAD_DECODING"], [84, 11, 1, "", "MEDUSA"], [84, 11, 1, "", "NGRAM"], [84, 11, 1, "", "NONE"], [84, 12, 1, "", "from_arguments"]], "tensorrt_llm.models.WhisperEncoder": [[84, 12, 1, "", "forward"], [84, 12, 1, "", "precompute_relative_attention_bias"], [84, 12, 1, "", "prepare_inputs"]], "tensorrt_llm.plugin": [[85, 10, 1, "", "PluginConfig"]], "tensorrt_llm.plugin.PluginConfig": [[85, 12, 1, "", "to_legacy_setting"]], "tensorrt_llm.quantization": [[86, 10, 1, "", "QuantAlgo"], [86, 10, 1, "", "QuantMode"], [86, 14, 1, "", "quantize_and_export"]], "tensorrt_llm.runtime": [[87, 10, 1, "", "ChatGLMGenerationSession"], [87, 10, 1, "", "EncDecModelRunner"], [87, 10, 1, "", "GenerationSequence"], [87, 10, 1, "", "GenerationSession"], [87, 10, 1, "", "KVCacheManager"], [87, 10, 1, "", "LogitsProcessor"], [87, 10, 1, "", "LogitsProcessorList"], [87, 10, 1, "", "ModelConfig"], [87, 10, 1, "", "ModelRunner"], [87, 10, 1, "", "ModelRunnerCpp"], [87, 10, 1, "", "MultimodalModelRunner"], [87, 10, 1, "", "QWenForCausalLMGenerationSession"], [87, 10, 1, "", "SamplingConfig"], [87, 10, 1, "", 
"Session"], [87, 10, 1, "", "StoppingCriteria"], [87, 10, 1, "", "StoppingCriteriaList"], [87, 10, 1, "", "TensorInfo"], [87, 14, 1, "", "decode_words_list"]], "tensorrt_llm.runtime.EncDecModelRunner": [[87, 12, 1, "", "encoder_run"], [87, 12, 1, "", "from_engine"], [87, 12, 1, "", "generate"], [87, 12, 1, "", "process_input"]], "tensorrt_llm.runtime.GenerationSequence": [[87, 12, 1, "", "get_batch_idx"], [87, 12, 1, "", "get_seq_idx"]], "tensorrt_llm.runtime.GenerationSession": [[87, 11, 1, "", "batch_size"], [87, 11, 1, "", "buffer_allocated"], [87, 13, 1, "", "context_mem_size"], [87, 13, 1, "", "conv_kernel"], [87, 13, 1, "", "cross_attention"], [87, 11, 1, "", "cuda_graph_mode"], [87, 12, 1, "", "cuda_stream_guard"], [87, 11, 1, "", "debug_mode"], [87, 11, 1, "", "debug_tensors_to_save"], [87, 12, 1, "", "decode"], [87, 12, 1, "", "decode_batch"], [87, 12, 1, "", "decode_regular"], [87, 12, 1, "", "decode_stream"], [87, 11, 1, "", "device"], [87, 13, 1, "", "dtype"], [87, 12, 1, "", "dump_debug_buffers"], [87, 12, 1, "", "early_stop_criteria"], [87, 13, 1, "", "engine_inspector"], [87, 12, 1, "", "filter_medusa_logits"], [87, 12, 1, "", "finalize_decoder"], [87, 12, 1, "", "find_best_medusa_path"], [87, 13, 1, "", "first_layer"], [87, 13, 1, "", "gather_context_logits"], [87, 13, 1, "", "gather_generation_logits"], [87, 13, 1, "", "gemm_allreduce_plugin"], [87, 12, 1, "", "get_next_medusa_tokens"], [87, 12, 1, "", "get_num_heads_kv"], [87, 12, 1, "", "handle_per_step"], [87, 13, 1, "", "has_position_embedding"], [87, 13, 1, "", "has_token_type_embedding"], [87, 13, 1, "", "head_size"], [87, 13, 1, "", "hidden_size"], [87, 13, 1, "", "is_medusa_mode"], [87, 13, 1, "", "is_redrafter_mode"], [87, 13, 1, "", "kv_cache_type"], [87, 13, 1, "", "last_layer"], [87, 12, 1, "", "locate_accepted_draft_tokens"], [87, 11, 1, "", "mapping"], [87, 13, 1, "", "max_draft_tokens"], [87, 13, 1, "", "max_prompt_embedding_table_size"], [87, 12, 1, "", "medusa_decode_and_verify"], 
[87, 11, 1, "", "medusa_paths"], [87, 11, 1, "", "medusa_position_offsets"], [87, 11, 1, "", "medusa_temperature"], [87, 11, 1, "", "medusa_topks"], [87, 11, 1, "", "medusa_tree_ids"], [87, 12, 1, "", "next_medusa_input_ids"], [87, 11, 1, "", "num_draft_tokens"], [87, 13, 1, "", "num_heads"], [87, 13, 1, "", "num_layers"], [87, 13, 1, "", "num_medusa_heads"], [87, 13, 1, "", "paged_kv_cache"], [87, 13, 1, "", "paged_state"], [87, 12, 1, "", "pp_communicate_final_output_ids"], [87, 12, 1, "", "pp_communicate_new_tokens"], [87, 12, 1, "", "process_logits_including_draft"], [87, 13, 1, "", "profiler"], [87, 13, 1, "", "quant_mode"], [87, 13, 1, "", "remove_input_padding"], [87, 12, 1, "", "reorder_kv_cache_for_beam_search"], [87, 13, 1, "", "rnn_conv_dim_size"], [87, 13, 1, "", "rnn_head_size"], [87, 13, 1, "", "rnn_hidden_size"], [87, 11, 1, "", "runtime"], [87, 12, 1, "", "setup"], [87, 13, 1, "", "state_dtype"], [87, 13, 1, "", "state_size"], [87, 13, 1, "", "tokens_per_block"], [87, 12, 1, "", "update_output_ids_by_offset"], [87, 13, 1, "", "use_gemm_allreduce_plugin"], [87, 13, 1, "", "use_gpt_attention_plugin"], [87, 13, 1, "", "use_kv_cache"], [87, 13, 1, "", "use_lora_plugin"], [87, 13, 1, "", "use_mamba_conv1d_plugin"], [87, 13, 1, "", "vocab_size"]], "tensorrt_llm.runtime.KVCacheManager": [[87, 12, 1, "", "add_sequence"], [87, 12, 1, "", "get_block_offsets"], [87, 12, 1, "", "step"]], "tensorrt_llm.runtime.ModelConfig": [[87, 11, 1, "", "conv_kernel"], [87, 11, 1, "", "cross_attention"], [87, 11, 1, "", "dtype"], [87, 11, 1, "", "gather_context_logits"], [87, 11, 1, "", "gather_generation_logits"], [87, 11, 1, "", "gemm_allreduce_plugin"], [87, 11, 1, "", "gpt_attention_plugin"], [87, 11, 1, "", "gpu_weights_percent"], [87, 11, 1, "", "has_position_embedding"], [87, 11, 1, "", "has_token_type_embedding"], [87, 11, 1, "", "head_size"], [87, 11, 1, "", "hidden_size"], [87, 11, 1, "", "kv_cache_type"], [87, 11, 1, "", "language_adapter_config"], [87, 11, 1, "", 
"layer_types"], [87, 11, 1, "", "lora_plugin"], [87, 11, 1, "", "lora_target_modules"], [87, 11, 1, "", "mamba_conv1d_plugin"], [87, 11, 1, "", "max_batch_size"], [87, 11, 1, "", "max_beam_width"], [87, 11, 1, "", "max_medusa_tokens"], [87, 11, 1, "", "max_prompt_embedding_table_size"], [87, 11, 1, "", "model_name"], [87, 11, 1, "", "num_heads"], [87, 11, 1, "", "num_kv_heads"], [87, 11, 1, "", "num_kv_heads_per_cross_attn_layer"], [87, 11, 1, "", "num_kv_heads_per_layer"], [87, 11, 1, "", "num_layers"], [87, 11, 1, "", "num_medusa_heads"], [87, 11, 1, "", "paged_state"], [87, 11, 1, "", "quant_mode"], [87, 11, 1, "", "redrafter_draft_len_per_beam"], [87, 11, 1, "", "redrafter_num_beams"], [87, 11, 1, "", "remove_input_padding"], [87, 11, 1, "", "rnn_conv_dim_size"], [87, 11, 1, "", "rnn_head_size"], [87, 11, 1, "", "rnn_hidden_size"], [87, 11, 1, "", "skip_cross_attn_blocks"], [87, 11, 1, "", "skip_cross_kv"], [87, 11, 1, "", "state_dtype"], [87, 11, 1, "", "state_size"], [87, 11, 1, "", "tokens_per_block"], [87, 11, 1, "", "trtllm_modules_to_hf_modules"], [87, 11, 1, "", "vocab_size"]], "tensorrt_llm.runtime.ModelRunner": [[87, 13, 1, "", "dtype"], [87, 12, 1, "", "from_dir"], [87, 12, 1, "", "from_engine"], [87, 13, 1, "", "gather_context_logits"], [87, 13, 1, "", "gather_generation_logits"], [87, 12, 1, "", "generate"], [87, 13, 1, "", "hidden_size"], [87, 13, 1, "", "mapping"], [87, 13, 1, "", "max_prompt_embedding_table_size"], [87, 13, 1, "", "max_sequence_length"], [87, 13, 1, "", "num_heads"], [87, 13, 1, "", "num_layers"], [87, 13, 1, "", "remove_input_padding"], [87, 12, 1, "", "serialize_engine"], [87, 13, 1, "", "use_lora_plugin"], [87, 13, 1, "", "vocab_size"], [87, 13, 1, "", "vocab_size_padded"]], "tensorrt_llm.runtime.ModelRunnerCpp": [[87, 13, 1, "", "dtype"], [87, 12, 1, "", "from_dir"], [87, 13, 1, "", "gather_context_logits"], [87, 13, 1, "", "gather_generation_logits"], [87, 12, 1, "", "generate"], [87, 13, 1, "", "hidden_size"], [87, 13, 1, 
"", "max_prompt_embedding_table_size"], [87, 13, 1, "", "max_sequence_length"], [87, 13, 1, "", "num_heads"], [87, 13, 1, "", "num_layers"], [87, 13, 1, "", "remove_input_padding"], [87, 13, 1, "", "vocab_size"], [87, 13, 1, "", "vocab_size_padded"]], "tensorrt_llm.runtime.MultimodalModelRunner": [[87, 13, 1, "", "audio_engine_dir"], [87, 13, 1, "", "cpp_e2e"], [87, 13, 1, "", "cpp_llm_only"], [87, 12, 1, "", "generate"], [87, 12, 1, "", "get_audio_features"], [87, 12, 1, "", "get_rope_index"], [87, 12, 1, "", "get_visual_features"], [87, 12, 1, "", "init_audio_encoder"], [87, 12, 1, "", "init_image_encoder"], [87, 12, 1, "", "init_llm"], [87, 12, 1, "", "init_processor"], [87, 12, 1, "", "init_tokenizer"], [87, 13, 1, "", "llm_engine_dir"], [87, 12, 1, "", "load_test_audio"], [87, 12, 1, "", "load_test_data"], [87, 12, 1, "", "prepare_position_ids_for_cogvlm"], [87, 12, 1, "", "preprocess"], [87, 12, 1, "", "ptuning_setup"], [87, 12, 1, "", "ptuning_setup_fuyu"], [87, 12, 1, "", "ptuning_setup_llava_next"], [87, 12, 1, "", "ptuning_setup_phi3"], [87, 12, 1, "", "ptuning_setup_pixtral"], [87, 13, 1, "", "python_e2e"], [87, 12, 1, "", "run"], [87, 12, 1, "", "setup_fake_prompts"], [87, 12, 1, "", "setup_fake_prompts_qwen2vl"], [87, 12, 1, "", "setup_fake_prompts_vila"], [87, 12, 1, "", "setup_inputs"], [87, 12, 1, "", "split_prompt_by_images"], [87, 12, 1, "", "tokenizer_image_token"], [87, 12, 1, "", "video_preprocess"], [87, 13, 1, "", "visual_engine_dir"]], "tensorrt_llm.runtime.QWenForCausalLMGenerationSession": [[87, 12, 1, "", "generate"]], "tensorrt_llm.runtime.SamplingConfig": [[87, 11, 1, "", "bad_words_list"], [87, 11, 1, "", "beam_search_diversity_rate"], [87, 11, 1, "", "early_stopping"], [87, 11, 1, "", "end_id"], [87, 11, 1, "", "frequency_penalty"], [87, 11, 1, "", "length_penalty"], [87, 11, 1, "", "max_attention_window_size"], [87, 11, 1, "", "max_new_tokens"], [87, 11, 1, "", "min_length"], [87, 11, 1, "", "min_p"], [87, 11, 1, "", 
"no_repeat_ngram_size"], [87, 11, 1, "", "num_beams"], [87, 11, 1, "", "num_return_sequences"], [87, 11, 1, "", "output_cum_log_probs"], [87, 11, 1, "", "output_log_probs"], [87, 11, 1, "", "output_sequence_lengths"], [87, 11, 1, "", "pad_id"], [87, 11, 1, "", "presence_penalty"], [87, 11, 1, "", "random_seed"], [87, 11, 1, "", "repetition_penalty"], [87, 11, 1, "", "return_dict"], [87, 11, 1, "", "sink_token_length"], [87, 11, 1, "", "stop_words_list"], [87, 11, 1, "", "temperature"], [87, 11, 1, "", "top_k"], [87, 11, 1, "", "top_p"], [87, 11, 1, "", "top_p_decay"], [87, 11, 1, "", "top_p_min"], [87, 11, 1, "", "top_p_reset_ids"], [87, 12, 1, "", "update"], [87, 11, 1, "", "use_beam_hyps"]], "tensorrt_llm.runtime.Session": [[87, 13, 1, "", "context"], [87, 13, 1, "", "context_mem_size"], [87, 13, 1, "", "engine"], [87, 12, 1, "", "from_engine"], [87, 12, 1, "", "from_serialized_engine"], [87, 12, 1, "", "infer_shapes"], [87, 12, 1, "", "run"], [87, 13, 1, "", "runtime"], [87, 12, 1, "", "set_shapes"]], "tensorrt_llm.runtime.TensorInfo": [[87, 11, 1, "", "dtype"], [87, 11, 1, "", "name"], [87, 12, 1, "", "numel"], [87, 11, 1, "", "shape"], [87, 12, 1, "", "squeeze"], [87, 12, 1, "", "view"]], "trtllm-serve-disaggregated": [[30, 17, 1, "cmdoption-trtllm-serve-disaggregated-c", "--config_file"], [30, 17, 1, "cmdoption-trtllm-serve-disaggregated-r", "--request_timeout"], [30, 17, 1, "cmdoption-trtllm-serve-disaggregated-t", "--server_start_timeout"], [30, 17, 1, "cmdoption-trtllm-serve-disaggregated-c", "-c"], [30, 17, 1, "cmdoption-trtllm-serve-disaggregated-r", "-r"], [30, 17, 1, "cmdoption-trtllm-serve-disaggregated-t", "-t"]], "trtllm-serve-disaggregated_mpi_worker": [[30, 17, 1, "cmdoption-trtllm-serve-disaggregated_mpi_worker-c", "--config_file"], [30, 17, 1, "cmdoption-trtllm-serve-disaggregated_mpi_worker-log_level", "--log_level"], [30, 17, 1, "cmdoption-trtllm-serve-disaggregated_mpi_worker-c", "-c"]], "trtllm-serve-serve": [[30, 17, 1, 
"cmdoption-trtllm-serve-serve-backend", "--backend"], [30, 17, 1, "cmdoption-trtllm-serve-serve-cluster_size", "--cluster_size"], [30, 17, 1, "cmdoption-trtllm-serve-serve-ep_size", "--ep_size"], [30, 17, 1, "cmdoption-trtllm-serve-serve-extra_llm_api_options", "--extra_llm_api_options"], [30, 17, 1, "cmdoption-trtllm-serve-serve-gpus_per_node", "--gpus_per_node"], [30, 17, 1, "cmdoption-trtllm-serve-serve-host", "--host"], [30, 17, 1, "cmdoption-trtllm-serve-serve-kv_cache_free_gpu_memory_fraction", "--kv_cache_free_gpu_memory_fraction"], [30, 17, 1, "cmdoption-trtllm-serve-serve-log_level", "--log_level"], [30, 17, 1, "cmdoption-trtllm-serve-serve-max_batch_size", "--max_batch_size"], [30, 17, 1, "cmdoption-trtllm-serve-serve-max_beam_width", "--max_beam_width"], [30, 17, 1, "cmdoption-trtllm-serve-serve-max_num_tokens", "--max_num_tokens"], [30, 17, 1, "cmdoption-trtllm-serve-serve-max_seq_len", "--max_seq_len"], [30, 17, 1, "cmdoption-trtllm-serve-serve-num_postprocess_workers", "--num_postprocess_workers"], [30, 17, 1, "cmdoption-trtllm-serve-serve-port", "--port"], [30, 17, 1, "cmdoption-trtllm-serve-serve-pp_size", "--pp_size"], [30, 17, 1, "cmdoption-trtllm-serve-serve-reasoning_parser", "--reasoning_parser"], [30, 17, 1, "cmdoption-trtllm-serve-serve-tokenizer", "--tokenizer"], [30, 17, 1, "cmdoption-trtllm-serve-serve-tp_size", "--tp_size"], [30, 17, 1, "cmdoption-trtllm-serve-serve-trust_remote_code", "--trust_remote_code"], [30, 17, 1, "cmdoption-trtllm-serve-serve-arg-MODEL", "MODEL"]]}, "objnames": {"0": ["c", "macro", "C macro"], "1": ["cpp", "type", "C++ type"], "2": ["cpp", "class", "C++ class"], "3": ["cpp", "function", "C++ function"], "4": ["cpp", "functionParam", "C++ function parameter"], "5": ["cpp", "member", "C++ member"], "6": ["cpp", "enum", "C++ enum"], "7": ["cpp", "enumerator", "C++ enumerator"], "8": ["cpp", "templateParam", "C++ template parameter"], "9": ["py", "module", "Python module"], "10": ["py", "class", "Python class"], "11": 
["py", "attribute", "Python attribute"], "12": ["py", "method", "Python method"], "13": ["py", "property", "Python property"], "14": ["py", "function", "Python function"], "15": ["py", "pydantic_field", "Python field"], "16": ["py", "pydantic_validator", "Python validator"], "17": ["std", "cmdoption", "program option"]}, "objtypes": {"0": "c:macro", "1": "cpp:type", "2": "cpp:class", "3": "cpp:function", "4": "cpp:functionParam", "5": "cpp:member", "6": "cpp:enum", "7": "cpp:enumerator", "8": "cpp:templateParam", "9": "py:module", "10": "py:class", "11": "py:attribute", "12": "py:method", "13": "py:property", "14": "py:function", "15": "py:pydantic_field", "16": "py:pydantic_validator", "17": "std:cmdoption"}, "terms": {"": [0, 1, 2, 3, 4, 6, 7, 8, 12, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 26, 27, 28, 29, 31, 45, 46, 50, 51, 52, 58, 65, 69, 70, 71, 73, 75, 77, 78, 79, 80, 82, 83, 84, 87, 88, 89, 90, 92, 93, 95, 96, 97, 98], "0": [0, 1, 2, 3, 5, 6, 7, 9, 10, 12, 13, 15, 16, 17, 19, 20, 22, 23, 25, 26, 27, 28, 29, 30, 33, 34, 35, 36, 37, 38, 39, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 56, 57, 58, 59, 60, 62, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 79, 80, 81, 82, 83, 84, 87, 88, 89, 91, 92, 94, 95, 99], "00": [16, 26, 55, 56, 57, 73, 74, 75, 92], "000": [20, 73], "0000": [73, 75], "0007503032684326172": 30, "001": 51, "0012": 73, "0017": 74, "003": 74, "0047": 92, "005": 74, "0070": 92, "0071": 92, "0096": 92, "00978": 90, "01": [25, 26, 55, 56, 57, 73, 74, 89, 93], "014": 23, "0158": 75, "016": 74, "0162": 77, "0165": 79, "017": 74, "02": [74, 93], "021": 74, "022": 74, "0235": 92, "0260": 92, "0273": 92, "028": 74, "0294": 92, "03": [79, 92, 93], "032": 26, "0339": 74, "03762": 82, "03961": 4, "03x": 27, "04": [66, 67, 74, 91, 93, 94], "043": 74, "0449": 92, "0461": 20, "0463": 74, "05": [74, 82, 83, 84, 92, 93], "05100": 82, "0523": 92, "055": 74, "0554": 75, "0560": 92, "0563": 74, "06": [26, 73, 74, 82, 83], "0630": 92, "0669": 20, "068": 
74, "0682": 92, "0689e": 73, "07": [25, 26, 74, 93], "0704": 75, "0713": 92, "0723": 92, "0732": 92, "0758": 20, "0772": 20, "0776": 92, "08": [26, 74, 79], "0804": 92, "082": 74, "0838": 74, "0881": 80, "089": 74, "09": [26, 92], "0903": 92, "0910": 92, "092": 74, "09353": 10, "0964": 74, "09685": 10, "097": 74, "09f": [0, 1], "0b": 2, "0e": 6, "0f": [0, 6, 70], "0rc1": 73, "0u": 1, "0x": 22, "0x0000000000000000": 93, "1": [0, 1, 2, 3, 5, 6, 7, 9, 10, 11, 12, 13, 15, 17, 19, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 35, 36, 37, 38, 39, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 58, 59, 60, 62, 64, 66, 67, 69, 70, 72, 73, 75, 76, 77, 79, 81, 82, 83, 84, 86, 87, 88, 91, 92, 94, 98], "10": [0, 9, 10, 12, 20, 25, 26, 27, 30, 36, 38, 43, 44, 51, 59, 64, 67, 70, 73, 74, 75, 77, 80, 82, 89, 91, 92], "100": [0, 9, 20, 30, 38, 56, 72, 73, 75, 88], "1000": [0, 72, 73, 74, 75], "10000": [82, 83, 84], "1003": 93, "100gb": 28, "101": 9, "101230": 51, "101978": 74, "102": [9, 22], "1024": [1, 6, 15, 20, 23, 25, 29, 36, 43, 44, 51, 54, 70, 73, 74, 75, 79, 82, 83, 92], "103": 9, "104": 93, "10438": 90, "1045": 92, "1047": 73, "1050": 92, "1051": 75, "1059": 73, "106563": 74, "1072": 92, "107501": 74, "10764": 53, "10774": 0, "1079": 19, "108": 74, "1082": 92, "10858": 36, "10b": [69, 82, 93], "10m": 22, "11": [0, 10, 12, 20, 23, 25, 64, 73, 74, 77, 82, 92], "11023": 73, "110804": 74, "110b": 93, "111": [22, 26], "111302": 74, "111618": 74, "111668": 74, "1118": 93, "1123": 93, "1134": 89, "1135": 92, "1141": 92, "1148": 93, "11489": 20, "11490": 73, "1151": 20, "115716": 74, "1160": [30, 37], "117": 74, "1178": 73, "1181": 93, "1183": 93, "119": 73, "11943": 73, "11947": 36, "1196": 20, "11b": [91, 93], "12": [0, 10, 15, 22, 26, 36, 64, 66, 67, 73, 74, 77, 79, 82, 92], "1207": 53, "1212": 92, "121847": 73, "1219": 20, "122": 73, "1225": 82, "12288": 73, "123": [30, 38, 39], "1234": [70, 84], "1239": 93, "1242": 93, "1248": 93, "125": 73, "1252": [19, 73], 
"1256": 93, "125m": [12, 15], "126": 73, "1267": 93, "127": 82, "1272": 92, "128": [0, 1, 5, 9, 10, 13, 16, 20, 21, 22, 23, 24, 25, 26, 30, 36, 38, 39, 49, 56, 70, 73, 74, 93], "1284": 93, "1287": 77, "1290": 92, "1291504": 75, "1293": 19, "12945": 20, "129498": 20, "13": [5, 10, 24, 28, 64, 73, 74, 75, 82, 92], "1300": 45, "13044": 53, "131072": [73, 75], "13195": 73, "132": [73, 74], "1323": 93, "1328": 93, "1329": 93, "133": 93, "13368": 73, "1337": 93, "1341": 20, "1343": 93, "1344": 93, "13525": 73, "13598": 73, "1363": 53, "137": 73, "1378": 92, "139": 74, "1392": 93, "13b": 22, "14": [10, 15, 25, 64, 73, 74, 77, 79, 80, 92], "140g": 19, "141": 23, "1418": 73, "141gb": [21, 74], "142": 28, "1424": 93, "1436": [20, 93], "1437": 92, "144": 77, "1446": 93, "1447": 93, "14480": 73, "1449": 93, "145": [79, 80], "1459": 92, "146": [79, 80], "1467": 93, "147": [75, 77, 79, 80], "1480": 93, "1486": 93, "149": [92, 93], "15": [10, 26, 64, 73, 74, 80, 82, 92], "150": 72, "1500": 74, "15043": 36, "1514": 93, "1529": 93, "1534": 93, "1535": 93, "1536": 20, "1537": 93, "1539": 93, "154": 26, "1552": 93, "1556": 92, "15585": 73, "1562": 93, "1564": [75, 79, 80], "158": 20, "1583": 93, "1584": 20, "1585": 75, "15889": 53, "1589": 93, "1590": 93, "1597": 77, "15u": 28, "16": [0, 5, 10, 11, 12, 16, 20, 22, 25, 26, 30, 33, 35, 55, 56, 57, 64, 65, 73, 74, 75, 76, 82, 83, 84, 89, 90, 92], "160": 93, "1607": 73, "161": [30, 37, 73], "1625": 77, "1626": 93, "163": 21, "1637": 93, "16384": [77, 79], "164": 26, "1642": 93, "1650": 93, "1660": 93, "1669": 93, "167": [73, 74], "1672": 92, "1674": 93, "1675": 93, "1676": 93, "168": 26, "16e": 91, "16x": [27, 89], "17": [0, 2, 10, 20, 64, 73, 74, 79, 92, 94], "1706": 82, "1721": 92, "1723": 93, "17233": 20, "173": 26, "1732": 93, "17323": 90, "1738": 93, "174": 74, "1741966075": 88, "1742": 93, "17453": 29, "17453v3": 1, "175": 74, "175b": 23, "176": 73, "176064": 20, "1762": 93, "1799": 93, "17b": 91, "18": [2, 10, 28, 64, 71, 73, 74, 
92], "180": [26, 89], "180000000": 0, "180b": [25, 73], "1811": 53, "1815": 93, "181540": 20, "182": 74, "1822": 36, "183": 74, "1834": 93, "184": 74, "185": [22, 73], "1851": 93, "18527": 36, "18533": 53, "18563": 73, "1861": 80, "1866": 80, "1885": 75, "1886": 93, "1889": 53, "1897": 93, "19": [2, 20, 64, 74, 80, 92], "1900": 53, "1909": 93, "191": 74, "192": 21, "1921": 20, "1926": 93, "1937": 93, "1939": 93, "1944": 79, "1953": 93, "1959": 73, "198": 26, "1985": 93, "1987": 93, "1993": 92, "1999": 93, "1_405b": 16, "1_70b": 16, "1b": [30, 33, 35, 38, 40, 42, 45, 46, 47, 48, 49, 50, 51, 52, 53, 58, 59, 60, 62, 66, 67, 69, 88, 94], "1d": [5, 82, 87], "1e": [15, 82, 83, 84], "1e20f": 1, "1g": 92, "1gb": 2, "1k": [20, 26, 27, 28], "1m": 80, "1st": [22, 82, 89], "1u": [0, 1], "1x": 26, "1xh200": 21, "1ytic": 93, "2": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 19, 21, 22, 23, 25, 26, 27, 28, 30, 42, 44, 48, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 64, 66, 67, 69, 70, 73, 74, 76, 77, 79, 80, 82, 84, 87, 90, 91, 92, 98], "20": [1, 6, 12, 13, 28, 30, 60, 62, 73, 74, 75, 79, 82, 87, 92], "200": [23, 70, 87], "2000": [28, 74], "20000": 74, "200mb": 28, "2017": 79, "2018": 93, "2023": [21, 92], "2024": 26, "2025": [20, 26, 73], "2028": 93, "203": 74, "2033": 80, "2039": 93, "204": [26, 74], "2040": 93, "2044": [79, 80], "2045": 79, "2048": [15, 20, 21, 23, 24, 28, 29, 49, 70, 73, 74, 75, 77, 78, 79, 80, 84, 87, 92, 93], "2056": 93, "206": 74, "20627": 36, "20685": 73, "2079": 92, "208": 74, "2081": [77, 79, 93], "2087": 93, "2089": 74, "209": 74, "20b": 93, "21": [12, 25, 26, 74, 79, 92, 93], "2101": 4, "2102": 74, "2106": 10, "2107": [53, 92], "210g": 19, "211": 26, "2113": 93, "2135": 93, "21367": 53, "2152": 93, "2158": 74, "2168": 20, "2169": 93, "21747": 73, "2176": 74, "21764": 73, "2182": 93, "2191": 93, "22": [28, 32, 74, 82, 92], "22000": 74, "22056": 73, "221": 73, "2210": 90, "2211": [82, 90], "2219": 93, "22213": 73, "2225": 92, "2232": 93, "224": 83, 
"2243": 93, "2263": 93, "227": 24, "2288": 93, "2294": 93, "22x": 27, "23": [73, 74, 92, 93], "2305": 92, "2306": 90, "2309": [1, 29], "232": 24, "2337": 53, "2352": 93, "2357": 93, "236": 26, "2366": 93, "2370": 93, "2373": 93, "2379": 93, "2388": 93, "239": 26, "2397": 73, "24": [0, 66, 67, 74, 92, 93, 94], "240": 74, "2401": 0, "2402": 10, "24189": 74, "2419": 93, "242": 74, "2425": 93, "2439": 93, "245": 26, "2458": 93, "2461": 79, "2466": 79, "2473": 93, "2474": [77, 79], "2484": 93, "2485": 93, "2487": 74, "249": 26, "25": [24, 26, 73, 74, 91, 93], "250": [20, 26], "2500": 74, "25032": 73, "252u": 28, "253": [26, 74], "2552": 93, "256": [1, 20, 21, 24, 28, 59, 70, 73, 74, 82, 92, 93], "25603": 73, "2573": 93, "2581": [77, 79], "2590780": 73, "259840": 89, "26": [73, 74, 77, 88], "260": 74, "2602": 36, "2628": [79, 80], "263": [21, 36, 53], "2640": 80, "2649": 92, "2671": 20, "2677": 93, "26778": 73, "2679": 77, "2685": 93, "2688": 53, "2691": 93, "27": [74, 93], "270": 74, "2712": 93, "274": [20, 93], "2742": 75, "275": 93, "27556": 53, "276": 74, "278": [36, 53, 74], "2782": 93, "2787": 93, "2796": 93, "28": [26, 73, 74, 92], "2820": 92, "2826700": 20, "28390": 73, "287113": 73, "288": 93, "29": [74, 89], "292": 74, "2939": 92, "294": 74, "297": 36, "29889": 53, "29892": 36, "299": [26, 73], "29962": 36, "2998": 92, "2b": [19, 64, 73], "2cta": 28, "2d": [12, 82, 83, 90], "2k": [20, 26, 27, 28], "2m": 80, "2nd": 82, "2u": 1, "2x": [22, 23], "3": [0, 1, 3, 5, 7, 9, 10, 17, 21, 22, 23, 25, 26, 27, 28, 43, 44, 46, 48, 52, 54, 58, 59, 64, 66, 67, 69, 70, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 87, 88, 92, 93, 94, 95], "30": [0, 12, 20, 26, 70, 74, 75, 77, 80, 82, 89], "300": [24, 73], "3000": [73, 74], "30000": 74, "30065": 73, "3019": 73, "3021": 20, "3022": 73, "303": 23, "3031": 79, "304": [36, 53], "3040": [75, 79, 80], "306": 36, "3072": 20, "30990": 73, "30b": 25, "30x": 25, "31": [11, 74, 75, 79, 80], "311": 74, "3132": 73, "315": [26, 74], "318": 74, 
"32": [1, 5, 9, 11, 20, 22, 23, 29, 36, 53, 70, 73, 74, 75, 82, 83, 84, 87, 88, 89, 90, 92, 93, 94], "3201": 75, "321": 73, "322": [36, 53], "3276": [75, 79, 80], "32768": 82, "3291": 92, "32b": 93, "32k": 93, "32x": 25, "33": [74, 92], "332": 74, "3328": 92, "3338": 75, "338": [26, 36, 53], "3389": 77, "33x": 27, "34": [20, 74], "340": [26, 74], "341": 23, "3442": 92, "3445": 92, "3452": 92, "3476": 20, "349": 23, "34b": 93, "35": [0, 70, 74], "351": 74, "3555": 92, "35611": 20, "357": 74, "36": [26, 74, 76, 77], "3671": 73, "368": 26, "37": 73, "370": 74, "371": 74, "374": 74, "375": 74, "3763": 26, "379": 74, "38": [73, 74], "384": [20, 74], "3863": 74, "387": 74, "387b12598a9e": 73, "3885": 20, "3887": 92, "39": [26, 74], "3914": 74, "3936": 73, "3977": 92, "399": 74, "3_1": 91, "3_3": 91, "3b": [34, 39, 61], "3d": [5, 82, 87], "3rd": 82, "3u": 1, "3x": [25, 26, 28], "4": [0, 1, 2, 7, 9, 10, 11, 12, 16, 19, 23, 25, 26, 27, 28, 30, 36, 43, 44, 49, 53, 54, 55, 56, 57, 64, 70, 73, 74, 75, 77, 78, 79, 80, 81, 82, 84, 87, 88, 89, 90, 91, 92, 93], "40": [6, 74, 77, 82, 93], "400": 28, "4000": 28, "403": 93, "405": 53, "405b": [73, 76], "4060": 89, "4066": 36, "408": 74, "4089": 80, "4096": [21, 28, 36, 73, 74, 77, 82, 83, 87], "40b": 25, "40gb": 29, "40x": 25, "41": 74, "41020": 73, "411": 73, "4117e": 73, "4133": 80, "41375": 73, "414": 20, "41607": 73, "4168": 20, "4192": 92, "42": [52, 73, 74], "4203099703668305365": 51, "4224": 74, "4248": 77, "4265": 73, "427": [53, 73, 74], "4280": 26, "43": [74, 88, 89], "433": 74, "437": 74, "438": 74, "44": [74, 89], "4408": 36, "442": 74, "4439": 73, "4451": 20, "4456": 74, "447": 74, "448": 74, "449": 93, "4493": [20, 79, 80], "4497": 74, "44x": 25, "45": [9, 74, 91, 93], "450": 74, "45000000000": 9, "453": 74, "4566": 74, "459": 74, "46": 25, "4600": 28, "462": 74, "463": 74, "4653": 36, "4656": 74, "466": 74, "4667": 74, "47": [25, 77], "4701": 73, "471": 74, "472": 36, "475": 74, "477": 74, "478": 93, "47x": 25, "48": 
[74, 77, 89, 93], "481": [22, 74], "482": 93, "488": 74, "49": [74, 77], "49152": 20, "495": 74, "496": 11, "4963": 73, "49b": 91, "4b": 93, "4bit": 21, "4gb": 28, "4u": 1, "4x": [21, 22, 23], "5": [0, 1, 9, 10, 12, 13, 15, 21, 22, 23, 25, 26, 27, 28, 34, 39, 44, 45, 51, 54, 61, 69, 70, 73, 74, 79, 82, 84, 87, 91, 92, 93], "50": [0, 25, 45, 70, 73, 74, 93], "500": [26, 28, 74], "5000": 74, "500000": 84, "5001": 53, "5007": 36, "500m": 25, "50272": 15, "505143404006958": 30, "5064": 74, "5073": 92, "51": 74, "512": [1, 10, 13, 23, 24, 70, 73, 74, 77, 79, 84], "5120": 20, "512mb": 2, "514": 74, "518": [36, 74], "51b": [91, 93], "51x": 25, "52269": 74, "524": 74, "525": 74, "526": [53, 74, 93], "52667": 74, "529": 74, "5299": 77, "53": [73, 79, 80], "5305": 77, "531": 74, "54": [25, 74], "540": 73, "543": 74, "544": 74, "5496": 77, "5497": 74, "55": [25, 73, 74], "5500": 74, "5510": 73, "5514": 73, "5530": 74, "554": 74, "557": 74, "559": 74, "56": [25, 74], "560": 21, "562": [10, 13], "56401920000": 30, "565": 74, "567": 74, "568": [73, 74], "57": [73, 74], "571": 74, "572": 74, "5739": 20, "5742": [77, 79], "579": 74, "58": [26, 74, 79], "580": 74, "5821": 74, "5830": 92, "5874": 92, "5877": 77, "5879": 92, "588": 74, "58x": 26, "59": 73, "590": [36, 74], "5918": 92, "5942": 20, "5957": 92, "5976": 77, "598": 74, "5980": 77, "5b": 93, "5th": [28, 82], "5u": 1, "5x": [22, 25, 26], "6": [0, 1, 6, 9, 10, 12, 23, 25, 26, 27, 28, 30, 44, 54, 70, 74, 82, 87, 91, 92, 93], "60": [0, 74], "600": 31, "6000": 73, "602": 74, "6049": 77, "6059": 73, "6064": 92, "608": 74, "61": 74, "610": 74, "6100": 20, "6157": 92, "618": 74, "62": [26, 74, 79], "6255": 92, "626": 36, "6299": 92, "63": [43, 44, 54, 65, 73, 74, 79, 84, 89], "630": 74, "63266": 75, "63307": 75, "63308": 75, "63331": 75, "63374": 75, "634": 74, "63456": 75, "6345624": 75, "6372": 77, "639": 93, "64": [0, 1, 5, 6, 15, 20, 22, 23, 29, 34, 39, 58, 61, 74, 79, 82, 83, 84, 89, 93], "640": [21, 74], "640gb": 28, "6452": 
80, "6475": 79, "649": 93, "64x": 26, "65": [67, 74], "65024": 92, "6523": 80, "653": 74, "654": 23, "6550": 77, "6554": 79, "656": 74, "657": 74, "659": 74, "6591": 73, "66": [26, 74], "661": 74, "6628": [79, 80], "6678": 89, "6684": 80, "6695": 89, "67": [25, 26, 74], "6701": 20, "671": 20, "67108864": 65, "671b": 27, "673": 93, "675": 73, "6753e": 73, "6769": 79, "679": 22, "68": [25, 26, 74, 80], "682": 74, "6825": 73, "683": 74, "684": 26, "685": 74, "6852": [77, 79], "686": 74, "6862": 73, "6890": 92, "69": [25, 26, 74, 80, 88], "6925": 73, "6938": 36, "695": 93, "696": 74, "697": 28, "6975": 77, "6976": [75, 79, 80], "698": 74, "6a": 21, "6b": [22, 73, 82, 93], "6x": 23, "7": [0, 1, 9, 10, 21, 22, 25, 26, 27, 28, 44, 54, 64, 65, 66, 67, 73, 74, 75, 82, 87, 92], "70": [0, 25, 80, 89], "700": 31, "7000": 73, "701": 93, "7031": 77, "704": 74, "705": [28, 93], "706": 74, "7063": 73, "707": 74, "7072": 74, "709": 73, "7090": 92, "70b": [5, 19, 23, 25, 54, 75, 77, 78, 79, 80, 81, 91, 93], "70g": 19, "71": [26, 73, 74], "711": 74, "712": 74, "7134": 92, "7136": 75, "714": 74, "7144": 92, "7168": [26, 28], "717": 74, "7187": 74, "7188": 20, "72": [74, 76], "722": 74, "727": 74, "72b": [91, 93], "73": [26, 74], "732": 74, "734": 74, "736": 74, "737": 74, "7382": 74, "739": 93, "74": [26, 74], "741": [74, 93], "742": 74, "745": 74, "7456": 20, "74561": 20, "747": 74, "7480": 75, "75": [25, 73, 93], "750": [23, 74], "7502": 75, "7520": 20, "755": 31, "7584": 20, "75903": 74, "76": 74, "7607": 79, "7621": 74, "7638": [75, 79, 80], "767": 74, "768": [15, 83], "77": 74, "772": 74, "7743": 75, "7770": 75, "78": [26, 74, 77], "780": 73, "7842": 77, "78509": 74, "7876": 79, "79": [73, 89], "7900": 92, "7933": 79, "794": [74, 93], "7949": 92, "7977": 77, "7a": 21, "7b": [10, 12, 13, 25, 30, 43, 44, 54, 73, 74, 88, 91, 93], "7x": [22, 26], "8": [0, 1, 5, 9, 10, 11, 15, 16, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 36, 37, 40, 42, 43, 44, 46, 47, 48, 49, 50, 52, 54, 55, 56, 
57, 59, 64, 66, 67, 70, 73, 74, 75, 76, 77, 81, 82, 83, 84, 88, 89, 90, 92], "80": [0, 6, 23, 26, 28, 65, 74, 93], "800": [21, 74, 93], "8000": [30, 33, 34, 35, 37, 38, 39, 60, 61, 62, 88], "8002": 73, "8005": 74, "803": 21, "8048": 73, "80gb": [22, 25, 29, 74, 75, 77, 78], "81": [26, 74, 77], "810": 74, "8149": 92, "8179": 92, "819": 23, "8192": [29, 70, 73, 74, 75, 79, 82, 83, 92, 93], "82": [26, 74, 77], "820": 73, "8212": 1, "8218": 92, "822": 74, "8225": 77, "825": 93, "8259": 73, "83": 74, "8307": 80, "8351": 73, "838": 74, "84": [26, 74], "840": 74, "841": 74, "8441": 73, "85": [20, 25, 73, 74, 93], "850": 74, "851": 74, "854": 74, "86": [65, 74], "863": 73, "866": 74, "867": 74, "8672": 92, "87": [25, 74], "8779": 92, "88": [74, 77, 80], "8804": 75, "8828": 92, "8841": 77, "89": [25, 26, 65, 74, 91], "893": 74, "8932": 73, "8958": 80, "896": [53, 74], "8a": 24, "8b": [46, 54, 69, 73, 88, 91, 94], "8bit": 22, "8tb": 23, "8x": 28, "8x7b": [4, 73, 91, 93], "8xb200": 26, "8xgpu": 28, "8xh100": 24, "8xh200": 21, "9": [0, 1, 10, 12, 19, 22, 26, 27, 44, 54, 59, 64, 74, 77, 82, 92], "90": [0, 20, 65, 70, 73, 74, 75, 77, 81, 89], "9007": 20, "9028": 92, "907": 22, "9087": 80, "91": 74, "910": 74, "9101": 74, "911": 74, "9115": 80, "912656": 20, "913": 74, "9184": 77, "92": [26, 74], "920": 74, "9203": 77, "9214": 74, "924": 15, "925": 74, "9274": 75, "93": 74, "935": 93, "9353e": 75, "9379": 20, "94": 74, "94022": 74, "941": [21, 24], "943": 53, "944": 74, "946": 21, "947": 74, "9494": 79, "95": [30, 37, 40, 42, 43, 44, 46, 47, 48, 49, 50, 52, 54, 59, 66, 67, 74, 75, 81, 88], "9521": 92, "953": 74, "9537": 77, "954": 28, "956": 74, "957": 74, "96": [21, 26, 28, 74, 77, 93], "960": 21, "9606": 28, "961": 74, "9613": 28, "9623": 79, "9629": 28, "963": 74, "9639": 74, "96583": 74, "967": 93, "9692": 92, "97": [28, 73, 74, 77], "970": 74, "98": 74, "983": 93, "987": 93, "99": [9, 26, 31, 74], "990": 74, "991": 74, "992": 93, "9928": 80, "9938": 20, "9982": [79, 80], 
"9x": [23, 24], "A": [0, 1, 2, 3, 5, 6, 8, 10, 12, 15, 16, 19, 20, 25, 26, 52, 55, 56, 57, 58, 70, 72, 73, 74, 82, 87, 93, 95, 97], "AND": 82, "And": [12, 19, 27, 28, 82, 83, 89], "As": [4, 5, 7, 10, 12, 16, 18, 27, 36, 77, 80, 81, 82, 89, 90, 92, 97, 98], "At": [14, 28, 58, 77, 83, 89], "But": [5, 8, 71], "By": [0, 1, 2, 6, 12, 26, 28, 36, 65, 70, 73, 77, 80, 82, 92, 97], "For": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 12, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 36, 40, 52, 55, 56, 57, 63, 65, 69, 73, 74, 75, 76, 77, 79, 80, 81, 82, 87, 88, 89, 92, 93, 95, 96, 97, 98, 99], "If": [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 15, 16, 17, 19, 25, 27, 29, 30, 31, 32, 65, 66, 67, 69, 70, 71, 73, 75, 76, 77, 79, 80, 81, 82, 84, 87, 89, 91, 92, 93, 95, 97, 98, 99], "In": [0, 1, 2, 7, 8, 11, 12, 16, 17, 19, 20, 22, 25, 26, 27, 28, 32, 36, 54, 58, 64, 65, 73, 74, 75, 76, 77, 79, 80, 82, 88, 89, 90, 91, 92, 93, 97, 98, 99], "It": [0, 1, 3, 5, 6, 7, 10, 12, 14, 16, 17, 18, 20, 21, 24, 25, 26, 27, 28, 29, 36, 51, 58, 65, 70, 71, 73, 74, 77, 78, 79, 80, 81, 82, 88, 90, 92, 95, 96, 97, 99], "Its": [5, 82, 97], "NOT": 82, "No": [0, 2, 9, 58, 73, 75], "Not": [1, 25], "ON": [73, 77, 79, 80], "OR": 82, "Of": [26, 93], "On": [5, 9, 65, 67, 72, 76, 80, 82, 93], "One": [2, 15, 16, 79, 82, 92, 96], "Or": [82, 87, 94], "That": [3, 5, 6, 9, 16, 71, 77, 82], "The": [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 36, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 54, 55, 56, 57, 58, 59, 64, 65, 66, 67, 69, 70, 72, 73, 74, 75, 76, 77, 79, 80, 81, 82, 83, 84, 85, 87, 88, 89, 91, 92, 93, 94, 95, 96, 97, 98, 99], "Their": 28, "Then": [10, 19, 27, 30, 31, 73, 75, 82, 95, 98], "There": [2, 5, 6, 7, 8, 9, 10, 15, 19, 23, 26, 27, 28, 36, 65, 67, 69, 82, 85, 89, 90, 92, 93, 96, 97, 98, 99], "These": [2, 12, 19, 21, 23, 24, 26, 28, 36, 73, 75, 76, 83, 85, 88, 93], "To": [2, 3, 5, 9, 10, 12, 13, 16, 17, 18, 19, 20, 23, 26, 27, 65, 69, 
70, 71, 72, 73, 74, 77, 79, 80, 81, 82, 88, 89, 90, 93, 94, 95, 97, 98, 99], "Will": 0, "With": [5, 6, 12, 16, 31, 36, 49, 64, 73], "_": [0, 3, 17, 85], "__all__": 95, "__call__": 52, "__init__": [7, 14, 16, 17, 52, 70, 73, 92, 93, 95, 97, 99], "__main__": [40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 58, 59, 66, 67, 69, 75, 77, 80, 81, 88, 93, 94, 95], "__name__": [40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 58, 59, 66, 67, 75, 77, 80, 81, 88, 93, 94, 95], "__post_init__": 93, "__repr__": 93, "_capac": 1, "_context_logits_auto_en": 70, "_cpp_gen": 3, "_create_tensor": 16, "_explicitly_disable_gemm_plugin": 85, "_generation_logits_auto_en": 70, "_handl": 1, "_mark_output": 92, "_mpi_sess": 70, "_note": 5, "_num_postprocess_work": 70, "_path": 20, "_postproc_param": 70, "_postprocess_result": 70, "_postprocess_tokenizer_dir": 70, "_reasoning_pars": 70, "_return_log_prob": 70, "_run": 92, "_runtim": 87, "_str_to_trt_dtype_dict": 82, "_torch": [73, 93, 94, 95, 96, 97], "_torchllmargs__context": 70, "_trtllmargs__context": 70, "_unsign": 1, "_util": 82, "a10": 29, "a100": [6, 19, 29], "a10g": 29, "a2": 93, "a30": 29, "a40": 29, "a8": 90, "a_": 82, "a_1": 82, "a_2": 82, "a_n": 82, "a_sf": 82, "aarch64": 91, "ab": [10, 29, 82, 90], "abbrevi": 30, "abc": 27, "abcd": 27, "abi": [65, 93], "abil": [71, 73], "abl": [5, 22, 26, 67, 73, 79, 82, 93], "ablat": [27, 28], "abnorm": 93, "abort": [70, 93], "about": [0, 1, 3, 19, 20, 21, 22, 24, 25, 28, 51, 58, 59, 64, 73, 75, 77, 78, 80, 82, 88, 89, 92, 93], "abov": [2, 10, 11, 16, 19, 20, 25, 28, 36, 65, 73, 74, 75, 77, 80, 89], "absenc": 6, "absorb": 26, "abstract": [80, 83], "ac": 93, "acc": 82, "acceler": [5, 11, 12, 22, 23, 24, 25, 29, 71], "accept": [0, 1, 12, 20, 36, 46, 47, 48, 49, 50, 65, 70, 75, 77, 82, 87, 88, 91, 93, 97], "accept_length": 87, "acceptancelength": 0, "acceptancer": 0, "acceptancethreshold": 0, "acceptedlen": 1, "acceptedlengthscumsum": 1, "acceptedpath": 1, "acceptedpathid": 1, 
"acceptedtoken": 1, "acceptedtokenslen": 1, "access": [3, 32, 45, 70, 73, 75, 82, 88, 93], "accessor": 1, "accommod": [4, 96, 98], "accomplish": 76, "accord": [5, 17, 59, 82, 83, 97], "accordingli": 17, "account": [16, 20, 31, 55, 56, 57, 65], "accumul": [0, 5, 6, 29, 70, 82, 87, 88], "accur": [21, 27, 45, 73, 75, 93], "accuraci": [21, 26, 28, 29, 77, 81, 82, 90, 93], "achiev": [2, 12, 20, 21, 25, 26, 28, 65, 74, 75, 77, 79, 81, 95], "across": [2, 4, 5, 6, 7, 16, 17, 23, 26, 30, 74, 76, 77, 79, 80, 82, 87], "act": 26, "act_fn": 83, "act_typ": [16, 82], "action": 54, "activ": [0, 1, 5, 7, 16, 20, 21, 22, 25, 26, 28, 29, 76, 82, 90, 91, 93, 99], "activation_scaling_factor": 15, "activationtyp": [16, 82], "active_request": 99, "actual": [7, 8, 12, 20, 25, 26, 27, 29, 77, 79, 80, 81, 93, 98], "ad": [1, 5, 6, 7, 9, 12, 13, 19, 27, 28, 32, 52, 64, 72, 76, 79, 80, 82, 84, 87, 93, 94, 96], "ada": [5, 25, 59, 65, 71, 77, 91, 93], "adalayernorm": 83, "adalayernormcontinu": 83, "adalayernormzero": 83, "adalayernormzerosingl": 83, "adapt": [0, 10, 27, 40, 41, 70, 82, 83, 93, 95], "adapter_s": 10, "adapters": 1, "add": [1, 3, 5, 7, 10, 14, 15, 16, 19, 27, 31, 32, 52, 54, 65, 69, 70, 73, 75, 77, 80, 82, 87, 92, 93, 95, 98], "add_activ": 16, "add_argu": 54, "add_bias_linear": 84, "add_generation_prompt": 26, "add_input": 82, "add_output": 82, "add_padding_request": 98, "add_qkv_bia": 84, "add_rmsnorm": 26, "add_sequ": 87, "add_special_token": [26, 70, 87, 93], "addcumlogprob": 93, "added_kv_proj_dim": 83, "added_proj_bia": 83, "addit": [0, 5, 6, 10, 12, 16, 19, 23, 27, 28, 30, 36, 45, 65, 70, 73, 74, 76, 77, 79, 82, 83, 90, 91, 92, 93, 97, 98], "addition": [2, 73, 75, 77, 80, 95, 97], "additional_model_output": 70, "additional_opt": 57, "additionalmodeloutput": [0, 3, 70], "additionaloutput": [0, 3], "addr": 0, "address": [1, 17, 20, 25, 26, 28, 69, 80, 89, 93], "addresswiths": 1, "adequ": 83, "adher": 45, "adjust": [55, 70, 73, 75, 89, 99], "admin": 67, "adopt": [6, 19], 
"advanc": [12, 16, 24, 26, 27, 28, 29, 42, 46, 47, 49, 50, 65, 70, 82, 93, 97], "advantag": [6, 71], "advers": [21, 29], "advertis": 73, "advis": 2, "affect": [11, 19, 20, 29, 75, 77, 79, 80, 89], "affin": 83, "after": [0, 1, 3, 5, 7, 8, 9, 10, 12, 16, 17, 26, 27, 28, 29, 30, 31, 51, 54, 65, 69, 70, 73, 77, 79, 80, 81, 82, 83, 85, 88, 89, 93, 97, 99], "again": [16, 75, 77, 80, 92], "against": [65, 73], "agent": 23, "agentdesc": 0, "agentnam": 0, "agentst": 0, "aggreg": 28, "aggress": [15, 27, 77, 81], "agre": [69, 88], "ahead": [0, 5, 12], "ai": [20, 22, 26, 30, 37, 40, 42, 43, 44, 46, 47, 48, 49, 50, 54, 59, 66, 67, 71, 72, 75, 81, 82, 88, 91, 93, 94], "aidc": 93, "aim": [4, 15, 20, 26, 71, 73, 75, 77, 93], "ainsli": 21, "air": 93, "aka": 82, "akhoroshev": 93, "al": 21, "albeit": 12, "alessionetti": 93, "algorithm": [0, 5, 6, 12, 15, 16, 19, 25, 26, 27, 28, 70, 73, 77, 82, 93], "alia": [70, 83, 84], "alibi": 82, "alibi_bias_max": [82, 83], "alibi_scal": 82, "alibi_slop": 82, "alibi_with_scal": 82, "align": [73, 93, 99], "align_corn": 82, "all": [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 16, 17, 19, 20, 23, 26, 27, 28, 52, 55, 56, 57, 58, 65, 70, 71, 73, 74, 75, 76, 77, 79, 80, 81, 82, 83, 85, 87, 88, 89, 90, 91, 92, 93, 97, 98, 99], "all_reduce_param": [82, 83], "allbitset": [0, 1], "allgath": [16, 28, 29, 80, 82, 93], "allgeneratedtoken": 0, "alllayersdrafttokenid": 1, "alllayersdrafttokenidspredecessor": 1, "alllayersscor": 1, "alloc": [0, 1, 2, 5, 8, 9, 30, 36, 70, 81, 82, 87, 89, 92, 93, 96, 97, 98, 99], "allocateipcmemori": 1, "allocatespeculativedecodingbuff": 1, "allocnewblock": 0, "allocnewblocksperrequest": 0, "alloctotalblock": 0, "alloctotalblocksperrequest": 0, "allot": 0, "allottedtimem": [0, 93], "allow": [0, 1, 2, 3, 5, 6, 9, 12, 15, 21, 24, 28, 29, 69, 70, 71, 72, 73, 74, 75, 76, 77, 79, 80, 82, 85, 92, 93, 96, 99], "allowed_token_id": 52, "allreduc": [16, 26, 28, 29, 80, 82, 93], "allreducebuff": 1, "allreducefusionkernel": 26, "allreducefusionop": 82, 
"allreduceparam": [82, 83], "allreducestrategi": [11, 82], "almost": [16, 28, 77, 79, 89], "alon": 4, "along": [5, 12, 18, 65, 82, 93], "alpaca": 10, "alpha": [70, 82, 83, 93], "alphabet": 82, "alreadi": [0, 5, 7, 9, 18, 20, 26, 27, 28, 70, 77, 79, 81, 82, 93, 95, 98], "also": [0, 2, 3, 5, 7, 12, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26, 27, 28, 29, 30, 36, 49, 51, 52, 65, 69, 70, 73, 74, 75, 76, 77, 78, 79, 82, 83, 88, 89, 90, 93, 95, 96, 97, 98], "altair": 93, "alter": [3, 7], "altern": [3, 26, 52, 65, 95, 96], "although": [7, 16, 73, 77, 80], "alwai": [0, 1, 3, 5, 6, 9, 15, 16, 19, 28, 53, 70, 79, 80, 82, 92], "always_share_across_beam": 87, "am": [42, 46, 47, 49, 50, 52, 59, 75, 81, 87], "ambigu": 1, "amd": 93, "amen": [0, 3, 70], "among": [32, 82], "amongst": 82, "amount": [0, 9, 16, 28, 29, 70, 73, 79, 81, 87, 89, 92], "amper": [22, 65, 71, 91, 93], "an": [0, 1, 2, 3, 5, 6, 7, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 21, 23, 25, 26, 27, 28, 29, 30, 36, 42, 45, 46, 47, 48, 49, 50, 52, 59, 65, 67, 69, 70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 87, 88, 89, 90, 92, 93, 95, 96, 97, 98, 99], "analysi": [7, 26, 27, 28, 64, 89], "analysispatternmanag": 7, "analyt": 22, "analyz": [7, 75], "ani": [0, 1, 2, 3, 7, 8, 12, 17, 19, 20, 28, 30, 52, 65, 69, 70, 71, 73, 74, 79, 80, 81, 82, 84, 87, 92, 95, 96, 97], "announc": [20, 21, 22, 24], "anoth": [0, 1, 5, 7, 10, 19, 22, 26, 27, 28, 30, 79, 82, 92, 97, 99], "answer": [27, 45], "antialia": 82, "antonin": [42, 46, 47, 49, 50], "anybitset": [0, 1], "anyth": [58, 74], "aotman": 93, "apart": 36, "api": [2, 6, 9, 12, 14, 15, 16, 18, 20, 27, 28, 36, 37, 49, 55, 56, 57, 64, 65, 71, 72, 73, 74, 77, 78, 80, 81, 82, 89, 92, 94], "api_kei": [30, 60, 61, 62], "app": [65, 93], "appar": 71, "appear": [0, 5, 6, 51, 67, 70, 82, 92, 93], "append": [27, 52, 59, 72, 82, 99], "append_paged_kv_cach": 97, "appl": 93, "appli": [0, 2, 3, 5, 7, 10, 12, 15, 16, 17, 26, 27, 28, 29, 65, 70, 71, 73, 82, 83, 87, 90, 93, 97], "applic": [9, 12, 22, 
25, 26, 28, 30, 33, 34, 35, 67, 69, 71, 72, 88, 92, 93, 99], "apply_batched_logits_processor": [52, 70], "apply_chat_templ": [26, 45], "apply_llama3_sc": 82, "apply_query_key_layer_sc": [83, 84], "apply_residual_connection_post_layernorm": 84, "apply_rotary_pos_emb": 82, "apply_rotary_pos_emb_chatglm": 82, "apply_rotary_pos_emb_cogvlm": 82, "apply_silu": 82, "applybiasropeupdatekvcach": 93, "applyrop": 26, "appreci": 28, "approach": [0, 2, 4, 7, 9, 11, 12, 26, 27, 28, 69, 73, 81], "appropri": [25, 36, 92], "approv": 52, "approxim": [28, 65, 83], "apt": [20, 31, 65, 66, 67], "ar": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 33, 34, 42, 45, 46, 47, 49, 50, 52, 53, 54, 55, 56, 57, 58, 60, 61, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 87, 88, 89, 90, 92, 93, 94, 95, 96, 97, 98, 99], "arang": 82, "arbitrag": 73, "arbitrari": [17, 93], "arbitrary_types_allow": 70, "architectur": [2, 4, 6, 9, 15, 22, 27, 28, 65, 71, 84, 87, 91, 93, 94], "arctic": [91, 93], "area": [28, 59], "aresult": 36, "arg": [0, 7, 19, 30, 54, 70, 83, 84, 87, 93], "arglist": 7, "argmax": 82, "argpars": 54, "argument": [2, 3, 20, 30, 36, 49, 52, 65, 69, 70, 73, 76, 82, 89, 93, 97], "argumentpars": 54, "aris": 65, "arithmet": 16, "armor": 51, "around": [1, 15, 19, 71, 75, 80], "arrai": [0, 1, 70, 82, 87], "arrayview": [0, 1], "arriv": [0, 4], "arrivaltim": 0, "arrow": 82, "art": [20, 26], "articl": [5, 12, 26, 27], "artifici": 71, "artist": 59, "arxiv": [0, 1, 4, 10, 29, 82, 90], "as_dtyp": 82, "as_lay": 7, "as_shap": 82, "ascii": 82, "asciichar": 1, "ask": [51, 58, 92], "aspect": 5, "assembl": [16, 18], "assert": [7, 82, 92, 93, 99], "assert_valid_quant_algo": 84, "assign": [0, 2, 19, 83, 85, 95], "assist": [6, 30, 33, 34, 45, 60, 61, 69, 88], "assistant_model": 6, "associ": [1, 3, 4, 10, 28, 65, 75, 82], "asssembl": 12, "assum": [1, 3, 9, 10, 12, 13, 20, 27, 28, 70, 73, 82, 84, 87], "assumpt": [12, 
29], "async": [36, 47, 48, 70, 73, 87], "asynchron": [1, 3, 36, 40, 41, 70], "asyncio": [47, 48], "asyncllmengin": 93, "atom": 1, "attach": [2, 20], "attempt": [0, 2, 74, 75, 77], "attend": 81, "attent": [0, 1, 2, 6, 8, 9, 10, 12, 14, 16, 17, 20, 21, 29, 64, 70, 82, 87, 88, 89, 92, 93, 94, 95, 98], "attention_backend": [95, 97], "attention_head_s": [82, 83], "attention_mask": [82, 83, 84, 87, 97], "attention_mask_param": 84, "attention_mask_typ": 83, "attention_multipli": 84, "attention_output": 92, "attention_output_orig_quant_scal": 82, "attention_output_sf_scal": 82, "attention_packed_mask": [82, 83], "attention_param": [83, 84], "attention_qk_half_accumul": 93, "attention_window_s": 8, "attentionconfig": 0, "attentionheads": 1, "attentionmask": 97, "attentionmaskparam": 83, "attentionmasktyp": [82, 83], "attentionmetadata": 95, "attentionparam": [83, 84], "attentiontyp": 0, "attn_backend": [70, 97], "attn_bia": 84, "attn_dens": [10, 29], "attn_forward_funcnam": 83, "attn_k": [10, 29], "attn_logit_softcap": 84, "attn_logit_softcapping_scal": 82, "attn_metadata": 95, "attn_processor": 84, "attn_q": [10, 29], "attn_qkv": [10, 29], "attn_v": [10, 29], "attribut": [0, 1, 3, 7, 17, 19, 87], "audio": [87, 93], "audio_engine_dir": 87, "audio_featur": 87, "audio_path": 87, "authent": [69, 75, 88], "authorized_kei": [31, 32], "auto": [0, 1, 2, 3, 5, 6, 11, 13, 16, 42, 51, 70, 73, 80, 82, 84, 85, 86, 93], "auto_deploi": 93, "auto_deploy_config": 70, "auto_parallel": [29, 42, 70, 93], "auto_parallel_config": 70, "auto_parallel_world_s": [42, 70], "auto_quantize_bit": 86, "autoawq": 93, "autodeploi": 93, "autogptq": 93, "autom": [45, 93], "automat": [0, 3, 7, 11, 16, 17, 26, 30, 36, 40, 41, 69, 71, 73, 75, 82, 89, 90, 93], "autoparallelconfig": 70, "autopp": 93, "autoq": 93, "autoregress": [0, 12, 97, 98], "autotoken": 36, "autotun": [70, 93], "autotuner_en": [51, 70], "aux": 89, "auxiliari": 12, "avaiable_block": 99, "avail": [0, 1, 3, 7, 9, 16, 21, 23, 30, 36, 42, 46, 47, 
49, 50, 52, 65, 71, 73, 79, 80, 81, 87, 88, 89, 90, 93, 94, 97, 98], "averag": [0, 12, 20, 27, 70, 73, 74, 75, 77, 79, 80], "avg": [73, 75, 82], "avg_pool2d": 82, "avgnumdecodedtokensperit": 0, "avgpool2d": 83, "avoid": [1, 2, 19, 26, 27, 28, 65, 69, 87, 89, 93], "awai": [79, 80], "await": [0, 3, 36, 47, 48], "awaitcontextrespons": 0, "awaitgenerationrespons": 0, "awaitrespons": [0, 2, 3], "awar": [2, 5, 21, 92], "awq": [25, 36, 59, 64, 91, 93], "awq_block_s": 86, "ax": 82, "axi": [24, 82], "b": [1, 2, 7, 10, 16, 21, 22, 23, 24, 72, 82, 84, 87, 93], "b200": [27, 28, 74, 93], "b_sf": 82, "back": [0, 2, 9, 11, 12, 46, 49, 67, 74, 93], "backbon": 71, "backend": [0, 2, 3, 12, 16, 18, 20, 27, 28, 30, 37, 45, 51, 52, 55, 56, 57, 64, 70, 72, 73, 74, 88, 93, 96, 98, 99], "backend_token": [0, 3], "backendagentdesc": 0, "backu": [0, 3, 70], "backward": 19, "bad": [0, 3, 70, 93], "bad_token_id": 70, "bad_words_data": 87, "bad_words_list": 87, "badword": 0, "badwordslen": 1, "badwordslist": 1, "badwordsptr": 1, "baichuan": [69, 90, 91, 93], "baichuan2": 91, "baichuanconfig": 84, "baichuanforcausallm": 84, "balanc": [4, 6, 12, 16, 28, 70, 79, 81], "band": 45, "bandwidth": [6, 16, 21, 22, 23, 25, 28, 45], "bangbang": 22, "bantoken": 0, "banword": 0, "bar": 70, "bare": [93, 94], "barissglc": 58, "barnardo": 51, "bart": [91, 93], "base": [0, 1, 2, 3, 9, 10, 11, 12, 14, 17, 18, 19, 20, 21, 22, 25, 26, 28, 29, 47, 48, 54, 65, 70, 71, 73, 79, 81, 82, 83, 84, 85, 86, 87, 89, 91, 93, 94, 95, 96, 98, 99], "base64": 61, "base_model": 10, "base_s": 83, "base_url": [30, 60, 61, 62], "baseagentconfig": 0, "basekvcachemanag": 0, "baselin": [25, 26, 27, 28, 75, 79, 80, 97], "baseline_fp8_engin": 77, "basellmarg": 70, "basemodel": 70, "baseresourcemanag": [96, 98], "basetransferag": 0, "bash": [16, 30, 32, 33, 34, 35, 37, 38, 39, 55, 56, 57, 72], "basic": [14, 72, 82], "basic_string_view": 0, "batch": [0, 1, 6, 9, 10, 11, 12, 13, 16, 18, 20, 22, 23, 25, 26, 27, 28, 29, 30, 52, 64, 68, 70, 73, 
74, 75, 77, 78, 80, 81, 82, 83, 87, 88, 89, 92, 93, 95, 96, 97, 98, 99], "batch_beam_s": [5, 82], "batch_dim": 82, "batch_idx": 87, "batch_input_id": 87, "batch_manag": [0, 1, 98], "batch_schedul": 93, "batch_siz": [5, 7, 13, 15, 21, 24, 82, 83, 86, 87, 89, 97], "batchdon": 1, "batched_logits_processor": [52, 70], "batchedlogitsprocessor": [52, 70], "batchidx": 1, "batchindex": 1, "batching_typ": 70, "batchingtyp": [0, 70], "batchsiz": [0, 1, 6, 22], "batchsizelimit": 0, "batchsizet": 0, "batchslot": 1, "batchslotshostcopi": 1, "batchslotsrequestord": 1, "bc": 82, "beam": [0, 1, 6, 12, 18, 24, 29, 30, 36, 49, 64, 70, 82, 87, 89, 92, 93], "beam_search_diversity_r": [70, 87], "beam_width": [5, 6, 36, 82, 87, 93], "beam_width_arrai": 70, "beamhypothes": 1, "beamsearch": 0, "beamsearchbuff": 1, "beamsearchdiversityr": [0, 1, 6], "beamsiz": 0, "beamtoken": [0, 3], "beamwidth": [0, 1, 2, 3, 6, 70, 93], "beamwidtharrai": [0, 1, 6], "becam": 0, "becaus": [0, 3, 9, 20, 25, 26, 27, 28, 29, 36, 53, 58, 69, 73, 74, 75, 76, 77, 79, 81, 82, 89], "becom": [5, 6, 7, 9, 10, 16, 17, 25, 26, 28, 51, 71], "been": [0, 3, 4, 5, 19, 22, 23, 26, 28, 32, 54, 58, 65, 67, 70, 73, 77, 79, 82, 92, 93], "befor": [0, 1, 2, 3, 5, 7, 9, 10, 11, 15, 16, 17, 26, 27, 55, 56, 57, 64, 65, 67, 70, 71, 72, 76, 77, 79, 81, 82, 84, 87, 89, 92, 93, 95, 96, 97, 98, 99], "beforehand": 75, "begin": [12, 69, 71, 76, 93, 95], "behav": [0, 89], "behavior": [2, 5, 74, 79, 82, 87, 89, 93], "behaviour": [0, 82], "behind": [22, 28], "being": [0, 5, 9, 16, 19, 28, 58, 70, 79, 92, 93, 97], "believ": [51, 73], "belong": 79, "below": [0, 5, 6, 7, 8, 10, 20, 23, 24, 25, 27, 28, 31, 32, 73, 74, 77, 79, 80, 92], "bench": [20, 27, 40, 41, 58, 73, 74, 78, 93], "benchmark": [26, 27, 56, 64, 65, 72, 77, 78, 80, 88, 93], "benchmark_2nod": 30, "benefici": [28, 73, 79, 80], "benefit": [7, 9, 11, 23, 25, 27, 28, 29, 71, 79, 93], "bert": [29, 82, 90, 91, 93], "bert_attent": 82, "bert_attention_plugin": 29, 
"bert_context_fmha_fp32_acc": 29, "bertattent": 83, "bertattentionplugin": 82, "bertbas": 84, "bertforquestionansw": 84, "bertforsequenceclassif": [84, 91], "bertmodel": 84, "besid": 96, "best": [5, 16, 26, 27, 28, 53, 70, 72, 73, 76, 78, 79, 88, 93], "best_of": [70, 93], "best_path": 87, "best_path_len": 87, "best_path_length": 87, "best_perf_practice_on_deepseek": [26, 93], "bestpathindic": 1, "bestpathlength": 1, "beta": [30, 82], "beta_fast": 82, "beta_slow": 82, "better": [0, 2, 5, 6, 9, 11, 17, 19, 24, 26, 27, 28, 29, 55, 56, 57, 70, 74, 76, 77, 80, 81, 93], "between": [0, 2, 5, 6, 8, 9, 12, 16, 17, 19, 26, 27, 28, 34, 61, 67, 70, 72, 74, 76, 80, 81, 82, 83, 89, 92, 93, 95], "beyond": [1, 22, 77], "bf16": [5, 11, 17, 19, 26, 28, 64, 77, 80, 91, 93], "bfloat16": [5, 16, 29, 73, 75, 85, 90, 91, 93], "bhuvanesh09": 93, "bi": 5, "bia": [0, 3, 15, 16, 28, 70, 82, 83, 84, 93], "bias": [15, 82], "bidirect": [82, 83], "bidirectionalglm": 82, "bigger": 9, "biggest": 9, "billion": 20, "bin": [15, 16, 17, 20, 30, 33, 34, 35, 37, 38, 39, 55, 56, 57, 72, 92, 93], "binari": [12, 16, 72, 82], "bind": [52, 64, 70, 81, 87, 89, 93, 96, 98, 99], "bindcapacityschedul": 99, "bit": [0, 1, 5, 22, 58, 82, 90], "bitmask": 93, "bl": [12, 84], "black": 7, "blackwel": [2, 20, 27, 59, 64, 67, 76, 77, 91, 93], "blip": [90, 93], "blip2": [90, 91, 93], "blob": 26, "block": [0, 1, 2, 5, 6, 9, 16, 28, 29, 36, 51, 52, 64, 69, 70, 79, 82, 87, 89, 93, 98], "block_controlnet_hidden_st": 84, "block_hash": 51, "block_num": 82, "block_siz": [82, 83, 87], "block_sparse_block_s": 82, "block_sparse_homo_head_pattern": 82, "block_sparse_num_local_block": 82, "block_sparse_param": 83, "block_sparse_vertical_strid": 82, "blockhash": 0, "blockidx": 1, "blockptr": 1, "blocksiz": 0, "blockspars": 82, "blocksparseattnparam": 83, "blog": [20, 21, 24, 25, 26, 27, 28, 93], "bloodeagle40234": 93, "bloom": [6, 17, 90, 91, 93], "bloom_dict": 17, "bloomforcausallm": 84, "bloommodel": 84, "bm": 1, "bmm": 16, "board": 
80, "bodi": 16, "book": 58, "bool": [0, 1, 7, 13, 15, 70, 82, 83, 84, 85, 87, 97], "boolean": [1, 3, 10, 82, 84, 85], "boost": [20, 26, 28, 77, 79, 80], "born": [14, 16, 92], "borrow": [36, 49, 73], "bos_token_id": 87, "both": [0, 2, 4, 5, 7, 8, 10, 12, 16, 17, 20, 22, 25, 26, 27, 28, 29, 40, 54, 70, 73, 74, 76, 79, 81, 82, 83, 89, 90, 93, 96, 97], "bottleneck": [4, 11, 20, 25, 76, 79], "bottom": 32, "bound": [0, 6, 14, 16, 23, 26, 27, 28, 70, 73, 82, 87, 89], "boundari": [6, 16, 28, 70, 82, 84, 86, 89], "box": [7, 20], "bpru": 93, "brahma": 73, "branch": [12, 21, 24, 70], "breadth": 12, "break": [12, 26, 69, 73, 80, 93, 99], "breakdown": [72, 73, 74, 75], "breviti": 20, "brief": [84, 87, 97], "briefli": [34, 61], "brife": 0, "bring": [25, 26, 27, 28, 95], "broadcast": [3, 26, 82], "broadcast_help": 82, "broader": [5, 93], "broadli": 28, "broken": [71, 79, 93], "bsz": 83, "bu": 65, "budget": [13, 79], "buffer": [0, 1, 2, 3, 8, 9, 29, 30, 64, 70, 82, 93, 98], "buffer_0": 1, "buffer_1": 1, "buffer_2": 1, "buffer_alloc": 87, "buffercast": 1, "buffercastornul": 1, "bufferdatatyp": 1, "buffermanag": 89, "buffermanagertest": 1, "bufferptr": 1, "bufferrang": 1, "buffers": 1, "bufferview": 0, "bug": [28, 93], "build": [2, 3, 5, 6, 7, 9, 10, 12, 13, 14, 16, 18, 49, 51, 53, 54, 58, 64, 69, 70, 71, 72, 76, 77, 78, 79, 81, 84, 85, 88, 89, 92, 93], "build_cach": 70, "build_config": [19, 29, 36, 49, 53, 54, 58, 70, 77, 79, 80, 84], "build_dir": 65, "build_engin": 16, "build_flags_multiple_profil": 80, "build_serialized_network": 16, "build_wheel": [20, 65, 72], "buildcacheconfig": 70, "buildconfig": [13, 19, 36, 49, 53, 54, 58, 70, 77, 79, 80, 93], "builder": [13, 16, 19, 70, 93], "builder_force_num_profil": 93, "builder_opt": 93, "built": [3, 6, 9, 16, 19, 28, 29, 59, 65, 67, 69, 70, 73, 74, 75, 80, 81, 82, 88, 89, 92, 93], "bump": 1, "bumptaskinprogress": 1, "burden": 76, "busi": 0, "button": 93, "buvnswrn": 93, "bw": 93, "byt5": [91, 93], "byte": [0, 1, 11, 70, 87], 
"bytestostr": 1, "c": [0, 1, 2, 5, 7, 12, 16, 18, 20, 27, 28, 30, 31, 32, 36, 55, 56, 57, 64, 70, 71, 72, 79, 82, 84, 88, 93, 96, 98, 99], "cach": [0, 1, 2, 3, 6, 10, 16, 19, 25, 26, 27, 28, 29, 30, 36, 40, 41, 43, 44, 54, 64, 68, 70, 71, 73, 74, 75, 79, 82, 87, 88, 90, 93, 94, 95, 96, 97, 99], "cache_indir": 87, "cache_indir_t": 82, "cache_indirect": [5, 82, 83, 87, 92], "cache_root": 70, "cache_transceiver_config": 70, "cachehitr": 0, "cacheindirect": 1, "cachelevel": 0, "cachelevelupd": 0, "caches": 0, "cachest": 0, "cachetransceiv": 0, "cachetransceiverconfig": [0, 70], "cachetyp": 98, "cachevalu": 1, "calcul": [0, 21, 22, 24, 27, 28, 70, 73, 81, 82, 87, 89, 93], "calculate_speculative_resourc": 70, "calculatespeculativeresourc": 0, "calculatespeculativeresourcetupl": 0, "calib_batch": [59, 70, 77, 84], "calib_batch_s": [70, 77, 84], "calib_config": [59, 70, 77], "calib_dataset": [59, 70, 84, 86], "calib_max_seq_length": [59, 70, 77, 84, 86], "calib_s": [73, 86], "calibconfig": [59, 70, 77], "calibr": [17, 25, 28, 29, 59, 70, 77, 93], "call": [0, 1, 3, 4, 5, 6, 7, 16, 17, 19, 27, 28, 36, 52, 70, 72, 75, 77, 82, 84, 86, 87, 88, 89, 93, 95, 96, 97, 98], "callabl": [17, 52, 70, 84], "callback": [3, 52, 70], "can": [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 36, 40, 43, 44, 46, 49, 52, 53, 54, 55, 56, 57, 58, 59, 64, 65, 67, 69, 70, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], "canaccessp": 1, "cancel": [0, 3, 70, 73, 93], "cancelrequest": [0, 3], "candid": [0, 6, 12, 16, 26, 27, 70], "canenqueu": 0, "canenqueuerequest": 0, "cannon": 51, "cannot": [1, 6, 11, 16, 17, 26, 28, 69, 70, 79, 80, 81, 82, 89, 92, 93, 99], "cap": 75, "capabl": [21, 26, 45, 65, 71, 72, 77], "capac": [0, 1, 21, 23, 25, 70, 99], "capacitor_schedul": 99, "capacity_scheduler_polici": [70, 81], "capacityschedul": [96, 98, 99], "capacityschedulerpolici": [0, 70, 81, 93], "capit": 
[40, 42, 43, 44, 46, 47, 48, 49, 50, 54, 59, 66, 67, 75, 81, 88, 94], "caption": 83, "captur": [27, 28, 70, 97], "card": [53, 58], "carefulli": 20, "case": [0, 1, 2, 5, 6, 8, 9, 10, 12, 20, 22, 25, 26, 27, 28, 29, 36, 73, 74, 75, 77, 78, 80, 82, 90, 93], "cast": [28, 82], "cast_to_dtyp": 82, "castsiz": 1, "cat": [20, 27, 30, 56], "categor": [12, 28, 82], "categori": 85, "categorical_sampl": 82, "caus": [2, 3, 17, 19, 29, 70, 80, 92, 93], "causal": [27, 82, 83, 97], "cautiou": 19, "caveat": 77, "cd": [14, 15, 20, 27, 65, 73, 88, 92, 94], "ceil": [1, 84], "ceil_mod": [82, 83], "ceildiv": 1, "center": [22, 23], "central": 85, "certain": [2, 7, 15, 67, 71, 82], "cg": 84, "chain": 27, "challeng": [26, 71], "chanc": [9, 29, 81], "chang": [2, 5, 6, 8, 9, 10, 17, 19, 21, 23, 24, 27, 28, 65, 69, 70, 71, 73, 80, 82, 84, 87, 89, 92, 94, 98], "channel": [29, 82, 90, 93], "char": [0, 1], "charg": [6, 16, 97], "chart": 22, "chat": [12, 23, 35, 38, 40, 42, 45, 46, 47, 48, 49, 50, 51, 52, 53, 58, 59, 62, 63, 66, 67, 69, 88, 93, 94], "chatbot": 58, "chatcmpl": 88, "chatglm": [69, 82, 90, 91, 93], "chatglm2": [69, 91, 93], "chatglm3": [69, 84, 91, 93], "chatglm_vers": 84, "chatglmconfig": 84, "chatglmforcausallm": 84, "chatglmgenerationsess": 87, "chatglmmodel": 84, "check": [2, 3, 40, 66, 67, 70, 74, 76, 77, 79, 80, 82, 87, 88, 89, 92, 93, 95], "check_accuraci": 15, "check_config": 84, "check_gpt_mem_usag": 89, "checkbeamsearchdiversityr": 0, "checkbeamwidth": 0, "checkbeamwidtharrai": 0, "checkearlystop": 0, "checklengthpenalti": 0, "checkminp": 0, "checkmintoken": 0, "checknorepeatngrams": 0, "checknumreturnsequ": 0, "checkpoint": [14, 17, 18, 19, 20, 26, 27, 28, 29, 30, 46, 54, 64, 69, 70, 73, 75, 77, 86, 87, 88, 90, 92, 93, 95], "checkpoint_dir": [10, 13, 14, 15, 16, 19, 29, 73, 88, 92], "checkposteriorvalu": 0, "checkremotedesc": 0, "checkrepetitionpenalti": 0, "checktemperatur": 0, "checktopk": 0, "checktopp": 0, "checktoppdecai": 0, "checktoppmin": 0, "checktoppresetid": 0, 
"chef": 92, "chmod": 31, "choic": [0, 12, 25, 27, 29, 54, 73, 76, 82, 87, 88, 97], "choos": [16, 19, 26, 28, 77, 82, 93], "chosen": [28, 89, 99], "chrome": 72, "chrono": 0, "chunk": [0, 8, 28, 29, 64, 68, 70, 80, 82, 87, 89, 93], "chunk_dim": 83, "chunk_length": 93, "chunk_scan": 82, "chunk_siz": [82, 84], "chunkedcontextnexttoken": 1, "chunkedcontextnexttokenshost": 1, "ci": 1, "circular": 5, "citi": [59, 88], "ckpt": [54, 73, 88], "ckpt_dir": [16, 19, 84], "ckpt_llama_3": 16, "cl": [14, 19], "claim": [1, 17], "claimpag": 1, "claimpageswithevict": 1, "clamp": [70, 93], "clamp_val": 70, "class": [0, 1, 2, 5, 6, 7, 8, 13, 14, 16, 17, 19, 25, 29, 36, 43, 44, 46, 49, 52, 53, 54, 65, 69, 70, 76, 77, 80, 82, 83, 84, 85, 86, 87, 92, 93, 95, 96, 97, 99], "class_dropout_prob": 83, "class_label": 83, "classic": [16, 64], "classifi": [83, 84], "classmethod": [14, 19, 70, 83, 84, 87], "classvar": 70, "clean": [20, 65, 72, 92], "clear": [67, 79, 87], "clearli": 81, "cli": [15, 20, 36, 64, 73, 76, 77, 79, 80, 88], "click": [31, 32], "client": [0, 3, 30, 63, 74], "client_id": 52, "clientid": 0, "clip": 82, "clip_before_cast": 82, "clip_qkv": [83, 84], "clip_vision_model": 84, "clipvisiontransform": 84, "clock": 26, "clone": [10, 20, 65, 69, 75, 88, 92, 94], "clone_input": 7, "close": [5, 19, 20, 29, 80, 89], "closur": 82, "cloud": [22, 31, 32], "cls_token": 83, "cluster": [6, 16, 26, 29, 30, 67, 70, 93], "cluster_info": 93, "cluster_kei": [29, 93], "cluster_s": 30, "cmake": [65, 93], "cnn_dailymail": [59, 70, 84], "co": [0, 10, 20, 27, 28, 34, 61, 69, 82, 83, 88, 92], "coalesc": 52, "coast": 88, "code": [2, 5, 7, 8, 11, 12, 16, 19, 25, 26, 28, 30, 36, 55, 56, 57, 64, 69, 70, 71, 72, 73, 82, 90, 91, 92, 93, 95, 98, 99], "codebas": [8, 95], "codellama": 93, "codepath": 93, "codeqwen": 93, "coderham": 93, "cogvlm": [91, 93], "cogvlmattent": 83, "cogvlmconfig": 84, "cogvlmforcausallm": 84, "coher": [6, 93], "cohereconfig": 84, "cohereforcausallm": 84, "collabor": [6, 26, 28, 59, 
82], "collect": [1, 7, 11, 12, 16, 26, 28, 70, 74, 82, 95], "collect_and_bia": 83, "color": [58, 79], "column": [10, 82, 90], "columnlinear": [10, 14, 83], "com": [19, 20, 26, 65, 82, 88, 92, 93, 94], "combin": [0, 7, 12, 23, 26, 27, 28, 29, 54, 55, 56, 57, 73, 74, 77, 79, 83, 93, 97, 99], "combinedtimesteplabelembed": 83, "combinedtimesteptextprojembed": 83, "come": [6, 10, 22, 75, 76, 79, 81, 89, 92], "comm": 70, "comma": [82, 87], "command": [9, 10, 14, 15, 16, 19, 20, 30, 31, 32, 55, 56, 57, 65, 69, 72, 73, 75, 80, 85, 88, 89, 92, 93, 94], "commandr": 93, "comment": 93, "commit": 28, "commmod": 0, "common": [0, 5, 8, 9, 12, 20, 28, 40, 51, 69, 70, 82, 89, 98], "common_prefix": 51, "commonli": [7, 26, 30, 93], "commstat": 0, "commtyp": 0, "commun": [0, 2, 6, 11, 16, 28, 29, 59, 69, 71, 77, 82, 91, 93], "communicationmod": [0, 2], "communicationtyp": 0, "compani": 53, "compar": [1, 2, 17, 22, 23, 25, 27, 28, 77, 79, 80, 81, 82, 97], "comparison": [6, 22, 26, 27, 73], "compat": [12, 19, 27, 30, 65, 80, 83, 88, 91, 93, 95], "compbin": 10, "compil": [6, 11, 18, 64, 67, 70, 71, 72, 73, 82, 92], "complet": [0, 1, 2, 3, 6, 8, 9, 12, 33, 34, 36, 60, 61, 63, 65, 69, 70, 71, 73, 74, 75, 79, 80, 88, 93, 98, 99], "completion_token": 88, "completionoutput": [36, 53, 70], "complex": [7, 8, 12, 16, 26], "compli": 30, "complic": [27, 28, 95], "compon": [2, 3, 5, 16, 18, 25, 26, 27, 28, 64, 90, 96], "compos": [0, 6, 73], "comprehens": [20, 30, 71], "compress": [21, 28], "compris": 25, "comput": [0, 1, 4, 5, 6, 9, 12, 16, 21, 22, 23, 25, 26, 27, 28, 29, 42, 46, 47, 49, 50, 52, 70, 72, 73, 76, 77, 81, 82, 89, 92, 93, 95, 96, 97, 98], "compute_relative_bia": 83, "computecontextlogit": 1, "computegenerationlogit": 1, "computenumpackedmask": 1, "concat": [14, 26, 82], "concat_kvcach": 26, "concaten": [5, 10, 17, 26, 82, 95], "conced": 51, "concept": [16, 73, 78, 93, 98], "conceptu": 1, "concern": [16, 89], "conclus": 78, "concret": 95, "concur": 51, "concurr": [1, 2, 12, 20, 22, 26, 
27, 28, 73, 93], "cond_proj_dim": 83, "conda": 93, "condit": [0, 1, 3, 6, 7, 12, 73, 82, 83, 93], "condition": 82, "conditioning_embed": 83, "conditioning_embedding_dim": 83, "conduct": [5, 73], "confess": 51, "config": [0, 1, 5, 9, 10, 13, 14, 17, 19, 20, 21, 27, 28, 30, 37, 70, 73, 79, 83, 84, 85, 87, 92, 93, 95, 98], "config_class": 84, "config_dir": 84, "config_fil": [30, 70, 84], "configdict": 70, "configur": [0, 1, 2, 4, 5, 8, 12, 17, 18, 20, 23, 29, 30, 43, 44, 45, 49, 53, 54, 58, 65, 67, 70, 73, 74, 75, 78, 79, 81, 84, 87, 89, 92, 93, 97], "configuration_llama": 95, "configuration_mymodel": 95, "configuration_util": 95, "confirm": [42, 46, 47, 49, 50], "conform": 70, "conjunct": 79, "connect": [0, 11, 16, 75, 76, 78], "connectioninfo": 0, "connectioninfotyp": 0, "connectionmanag": 0, "connectremoteag": 0, "consecut": 6, "consequ": [2, 25, 76, 80], "conserv": [0, 81], "consid": [0, 1, 10, 12, 20, 25, 58, 59, 70, 74, 79, 82, 95, 99], "consider": [19, 25, 36], "consist": [7, 19, 22, 26, 70, 71, 73, 75, 82, 90, 92, 97], "consol": 31, "consolid": 12, "const": [0, 1, 3], "const_iter": 1, "constant": [1, 5, 82, 89], "constant_to_tensor_": 82, "constantli": [42, 46, 47, 49, 50], "constants_to_tensors_": 82, "constantthreshold": 1, "constexpr": [0, 1], "constpointercast": 1, "constrain": [6, 25], "constraint": [0, 5, 6, 25, 67, 82], "construct": [0, 1, 3, 12, 16, 73, 82, 93, 97], "constructor": [0, 13, 58, 69, 88, 97], "consult": [12, 65, 72], "consum": [0, 7, 28, 70, 82], "consumpt": [5, 22, 27, 29], "contact": 82, "contain": [0, 1, 2, 3, 5, 6, 7, 8, 10, 11, 15, 16, 17, 18, 19, 26, 29, 30, 32, 55, 56, 57, 66, 67, 70, 71, 73, 74, 82, 84, 87, 88, 90, 91, 93, 94, 96, 97], "container_imag": [55, 56, 57], "container_img": 30, "content": [1, 10, 19, 30, 31, 33, 34, 35, 45, 60, 61, 64, 82, 88, 89, 93], "context": [0, 2, 4, 9, 25, 27, 28, 29, 64, 68, 70, 73, 78, 82, 87, 89, 92, 93, 97, 98, 99], "context_chunking_polici": [70, 81], "context_fmha": [10, 29], 
"context_fmha_fp32_acc": 93, "context_fmha_typ": [5, 89], "context_init": 99, "context_len": [87, 97], "context_length": [82, 83, 87, 92], "context_logit": [70, 87], "context_mem_s": 87, "context_onli": 70, "context_parallel_s": 70, "context_phas": 5, "context_pre_onli": 83, "context_request": 99, "contextchunkingpolici": [0, 70, 81, 93], "contextexecutor": 2, "contextfmha": 1, "contextidx": 0, "contextlogit": 0, "contextmanag": 69, "contextparallel": 1, "contextphaseparam": [0, 2, 70], "contextpositionid": 1, "contextprefillposit": 0, "contextrequest": 1, "contextrequestid": 2, "contextrespons": 2, "contigu": [2, 8, 76, 82, 93], "continu": [1, 3, 5, 12, 23, 25, 29, 70, 71, 77, 79, 87, 99], "contract": 73, "contrast": [6, 12, 97], "contrib": 21, "contribut": [19, 27, 28, 73, 82, 93], "contributor": [26, 89], "control": [0, 2, 5, 6, 7, 11, 36, 40, 41, 70, 72, 73, 75, 81, 82, 83, 87, 90, 93], "conv": 82, "conv1d": [29, 82, 83], "conv2d": [82, 83], "conv3d": [82, 83], "conv_bia": 82, "conv_kernel": 87, "conv_stat": 84, "conv_state_or_ptr": 82, "conv_transpose2d": 82, "conv_weight": 82, "conveni": [1, 14, 19, 65], "convent": [19, 82], "convers": [1, 17, 24, 25, 58, 64, 88, 93], "convert": [0, 1, 10, 13, 14, 15, 16, 17, 19, 71, 73, 75, 77, 88, 92, 93, 97], "convert_and_load_weights_into_trtllm_llama": 19, "convert_checkpoint": [10, 13, 14, 15, 16, 19, 75, 76, 88, 92, 93], "convert_coneckpoint": 4, "convert_hf_mpt_legaci": 93, "convert_load_format": 70, "convert_util": 93, "convert_weights_from_custom_training_checkpoint": 19, "convkernel": 1, "convolut": [0, 87], "convtranspose2d": 83, "coordin": [12, 64, 82], "copi": [0, 1, 2, 9, 12, 29, 32, 70, 77, 82, 89, 93, 97], "copy_on_partial_reus": 70, "copyfrom": 1, "copyonpartialreus": 0, "copytask": 1, "copytaskmappag": 1, "copyto": 0, "copytocpu": 0, "copytogpu": 0, "copytomanag": 0, "copytopag": 1, "copytopin": 0, "copytopooledpin": 0, "core": [6, 7, 10, 13, 16, 19, 21, 22, 24, 28, 65, 69, 73, 76, 88, 92, 93, 96], 
"corner": 28, "coroutin": [47, 48, 70], "correct": [2, 3, 5, 10, 12, 27, 93], "correctli": [9, 82, 93, 95], "correspond": [0, 1, 2, 4, 5, 7, 8, 10, 12, 17, 19, 27, 30, 70, 72, 80, 82, 83, 87, 90, 92, 93, 95], "cost": [9, 16, 26, 27, 28, 73, 76, 89, 93], "costli": 26, "could": [0, 2, 7, 8, 9, 15, 46, 47, 48, 49, 50, 59, 70, 75, 89, 92, 93], "couldn": 79, "count": [0, 1, 6, 30, 38, 39, 69, 73, 84, 88], "count_include_pad": [82, 83], "countlocallay": 1, "countlowerranklay": 1, "cours": 12, "court": [42, 46, 47, 49, 50], "cover": [20, 77, 78, 80], "coverag": 70, "cp312": 65, "cp_config": 70, "cp_group": [82, 83], "cp_rank": [82, 83], "cp_size": [82, 83, 86, 93], "cp_split_plugin": 82, "cpp": [2, 3, 5, 6, 16, 20, 28, 30, 56, 64, 65, 72, 73, 74, 75, 92, 93], "cpp_e2e": 87, "cpp_extens": 67, "cpp_llm_onli": 87, "cpp_onli": 65, "cpu": [0, 1, 8, 9, 10, 13, 16, 26, 27, 29, 30, 52, 67, 70, 82, 89, 92, 93, 97], "cpumemusag": [0, 70], "crash": 93, "creat": [1, 2, 3, 7, 8, 9, 12, 13, 14, 16, 18, 19, 26, 30, 31, 36, 42, 46, 47, 48, 49, 50, 51, 52, 59, 60, 61, 62, 69, 70, 71, 73, 74, 75, 79, 80, 82, 83, 84, 87, 88, 89, 93, 95, 96, 97, 99], "create_allreduce_plugin": 82, "create_attention_const_param": 83, "create_builder_config": 13, "create_cuda_graph_metadata": 97, "create_execution_context": 87, "create_fake_weight": 82, "create_network": 16, "create_pytorch_model_based_executor": [98, 99], "create_runtime_default": 84, "create_sinusoidal_posit": 82, "create_sinusoidal_positions_for_attention_plugin": 82, "create_sinusoidal_positions_for_cogvlm_attention_plugin": 82, "create_sinusoidal_positions_long_rop": 82, "create_sinusoidal_positions_yarn": 82, "createloramodul": 1, "creation": [1, 70, 82, 89], "creativ": 6, "criteria": 87, "critic": [26, 73, 92], "crop": 83, "cropped_pos_emb": 83, "cross": [0, 10, 11, 26, 27, 70, 82, 87, 93], "cross_attent": [83, 87], "cross_attention_dim": 83, "cross_attention_mask": [83, 87], "cross_attention_mask_for_context": 87, 
"cross_attention_mask_for_gen": 87, "cross_attention_norm": 83, "cross_attention_norm_num_group": 83, "cross_attention_packed_mask": 83, "cross_attn_dens": [10, 29], "cross_attn_k": [10, 29], "cross_attn_q": [10, 29], "cross_attn_qkv": [10, 29], "cross_attn_v": [10, 29], "cross_kv": 82, "cross_kv_cache_block_offset": [83, 87], "cross_kv_cache_fract": [70, 87], "cross_kv_cache_gen": [83, 84], "cross_kv_length": 82, "cross_kv_reus": [83, 84], "crossattentionmask": 0, "crosskvcachefract": [0, 93], "crosskvcachestat": 0, "crucial": [12, 16, 25, 96], "ctor": 82, "ctx": 0, "ctx_request_id": 70, "ctxenginepath": 0, "ctxexecutorconfig": 0, "cu": [16, 26], "cu12": 93, "cu128": [66, 67], "cuassert": 92, "cubla": 28, "cublaslt": [29, 80], "cublasltmatmul": 28, "cublasscaledmm": 28, "cuda": [0, 1, 2, 5, 11, 16, 20, 27, 28, 52, 59, 65, 66, 67, 70, 72, 73, 84, 87, 89, 92, 93, 97, 98], "cuda_arch": 65, "cuda_architectur": [20, 65], "cuda_graph_batch_s": [20, 70, 74], "cuda_graph_cache_s": 70, "cuda_graph_inst": 92, "cuda_graph_max_batch_s": 70, "cuda_graph_mod": [70, 87, 92], "cuda_graph_padding_en": [20, 28, 56, 70, 74], "cuda_hom": 67, "cuda_launch_block": 92, "cuda_stream": 92, "cuda_stream_guard": 87, "cuda_stream_sync": 82, "cudadevicegetstreampriorityrang": 1, "cudaevent_t": 1, "cudaeventdisabletim": 1, "cudagraph": 93, "cudagraphcaches": 0, "cudagraphlaunch": 92, "cudagraphmod": 0, "cudamalloc": [1, 2], "cudamallocasync": [1, 2], "cudamemcpyasync": 52, "cudamempool": 1, "cudamempoolptr": 1, "cudaprofilerapi": 72, "cudart": 92, "cudastream": 0, "cudastream_t": 1, "cudastreamcreatewithflag": 1, "cudastreamnonblock": 1, "cudastreamptr": [0, 1], "cudeviceptr": 1, "cudnn": 93, "cufil": 0, "cumemgenericallocationhandl": 1, "cumlogprob": [0, 1], "cumlogprobscba": 1, "cumsum": [82, 93], "cumsumgenerationlength": 1, "cumsumlastdim": 82, "cumsumlength": 1, "cumul": [0, 1, 70, 82], "cumulative_logprob": [36, 53, 70], "curand": 93, "curl": [30, 63, 88], "currenc": 73, "current": [0, 
1, 2, 3, 5, 10, 12, 20, 25, 26, 27, 28, 29, 36, 45, 58, 65, 70, 73, 77, 79, 80, 81, 82, 87, 89, 91, 93, 94, 96, 97, 98, 99], "current_stream": 92, "currentexpandindic": 1, "curv": 24, "custom": [6, 16, 19, 21, 26, 27, 29, 40, 41, 43, 44, 52, 53, 54, 65, 71, 77, 80, 82, 87, 93, 96, 97], "custom_all_reduc": 93, "custom_mask": 82, "customallreduc": 93, "customized_key_dict": 17, "customized_preprocess": 17, "customizedmodulea": 17, "customizedmoduleb": 17, "cutlass": [28, 70, 93], "cxx11": 65, "cyclic": [64, 82, 87], "d": [1, 10, 30, 31, 33, 34, 35, 55, 56, 57, 58, 73, 82, 83, 88, 92, 93], "d0": 26, "d04e592bb4f6aa9cfee91e2e20afa771667e1d4b": 73, "d_": 27, "d_6": 27, "dangl": 7, "data": [0, 1, 2, 5, 6, 8, 11, 16, 17, 21, 22, 23, 24, 25, 26, 29, 51, 61, 70, 73, 74, 75, 82, 84, 91, 92, 93, 95], "data_path": 56, "data_typ": [13, 15], "datacontext": 0, "dataset": [26, 27, 28, 34, 56, 59, 61, 70, 72, 77, 93], "dataset_fil": 74, "dataset_path": 73, "datatyp": [0, 1, 6, 16, 82, 87, 90, 92], "datatypetrait": 1, "date": 19, "datetim": 70, "dbrx": [90, 91, 93], "dbrxconfig": 84, "dbrxforcausallm": 84, "dconv": 82, "de": 1, "deactiv": 36, "dead": 93, "deal": [5, 7, 92], "dealloc": [1, 8, 99], "death": [42, 46, 47, 49, 50], "debug": [0, 8, 29, 30, 64, 65, 87, 89, 93], "debug_buff": 92, "debug_mod": [87, 92], "debug_tensors_to_sav": 87, "debugconfig": 0, "debuginputtensor": 0, "debugoutputtensor": 0, "debugtensor": 0, "debugtensornam": 0, "debugtensorsmaxiter": 0, "debugtensorsperiter": 0, "dec": [29, 87, 93], "decai": [0, 6, 70], "decid": [5, 15, 64, 73, 78, 79, 90, 96, 99], "decilmforcausallm": 91, "decis": [58, 82], "declar": [1, 6, 7, 19, 96, 98], "decltyp": [0, 1], "decod": [0, 1, 2, 5, 6, 14, 19, 26, 28, 30, 40, 41, 64, 70, 73, 82, 87, 91, 93, 95, 98], "decode_batch": 87, "decode_duration_m": 70, "decode_regular": 87, "decode_retention_prior": 70, "decode_stream": 87, "decode_words_list": 87, "decode_wrapp": 97, "decodedurationm": 0, "decoder_batch": 1, "decoder_input_id": 
[84, 87], "decoder_language_adapter_rout": 87, "decoder_lay": 95, "decoder_start_token_id": 29, "decoderbuff": 1, "decoderenginebuff": 0, "decoderetentionprior": 0, "decoderjsonconfigstr": 0, "decoderlay": 95, "decoderlayerlist": 14, "decoderlookaheadbuff": 1, "decodermaskedmultiheadattent": 5, "decodermodel": [0, 84, 95], "decodermodelforcausallm": [14, 19, 84, 95], "decodermodelpath": 0, "decoderst": 93, "decoderxqarunn": 5, "decoding_config": 70, "decoding_typ": [20, 27, 70], "decodingbaseconfig": 70, "decodingconfig": [0, 1], "decodinginputptr": 1, "decodingit": 0, "decodinglayerworkspac": 1, "decodingmod": [0, 1, 93], "decodingoutputptr": 1, "decompos": 5, "decor": 95, "decoupl": [26, 89], "decreas": [21, 22, 77], "dedic": [26, 28, 92], "deduc": [29, 30, 93], "deep": [16, 22, 23, 72, 82, 93], "deeper": 27, "deepgemm": 20, "deeplearn": [82, 92], "deepseek": [30, 63, 72, 74, 91, 93], "deepseek_v1": 93, "deepseek_v2": 93, "deepseek_v3": [26, 93], "deepseekforcausallm": 84, "deepseekv1config": 84, "deepseekv2": 82, "deepseekv2attent": 83, "deepseekv2config": 84, "deepseekv2forcausallm": 84, "deepseekv3forcausallm": 91, "deepseekv3routingimpl": 28, "deepspe": 15, "def": [7, 14, 16, 17, 19, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 58, 59, 66, 67, 75, 77, 80, 81, 88, 92, 94, 95, 99], "default": [0, 1, 2, 3, 4, 5, 6, 9, 15, 17, 19, 27, 28, 29, 30, 31, 36, 54, 59, 64, 65, 70, 72, 74, 77, 78, 79, 80, 81, 82, 84, 87, 88, 89, 90, 92, 93, 95, 97], "default_net": 82, "default_plugin_config": 84, "default_trtnet": 16, "defaultvalu": 1, "defer": 82, "defin": [0, 1, 3, 5, 7, 12, 15, 16, 17, 18, 19, 20, 23, 29, 71, 73, 80, 82, 83, 90, 93, 95, 97], "definit": [3, 5, 8, 18, 19, 26, 64, 71, 82, 92], "deftruth": 93, "degrad": [0, 2, 29, 77], "degre": [42, 46, 47, 49, 50, 52, 74, 77, 80], "delai": [74, 93], "deleg": [82, 97], "delet": [0, 1, 85, 92], "deliv": [20, 21, 24, 26, 27, 74], "delta": [0, 26, 27, 82, 83], "delta_bia": 82, "delta_softplu": 82, "delv": 28, 
"demand": [26, 28], "demo": [26, 34, 61], "demonstr": [3, 17, 22, 26, 69, 75, 77, 79, 80], "denmark": 51, "denois": 83, "denot": 12, "dens": [4, 5, 10, 15, 17, 82], "dense_4h_to_h": 17, "dense_bia": 83, "dense_h_to_4h": 17, "densiti": 25, "dep": 65, "departur": 51, "depend": [0, 2, 3, 5, 6, 7, 12, 15, 23, 30, 67, 70, 74, 75, 77, 80, 82, 89, 92, 93, 98], "deploi": [12, 15, 30, 64, 67, 70, 71], "deplot": [91, 93], "deploy": [25, 26, 71, 73, 77, 88, 93], "deprec": [29, 70, 71, 73, 93], "deprecationwarn": 73, "depriv": 7, "depth": 12, "dequ": [0, 1], "dequant": [5, 11, 64, 82], "deregistermemori": 0, "deriv": [16, 17, 82, 89, 96], "desc": 0, "descendli": 6, "describ": [0, 5, 6, 8, 9, 10, 12, 14, 16, 17, 18, 20, 24, 32, 34, 61, 65, 69, 73, 74, 80, 82, 90, 92, 97], "descript": [0, 1, 6, 10, 30, 54, 64, 73, 74, 80, 82, 97], "descriptor": 70, "deseri": [0, 19, 52], "deserializeadditionalmodeloutput": 0, "deserializeadditionaloutput": 0, "deserializeagentst": 0, "deserializebool": 0, "deserializecachest": 0, "deserializecachetransceiverconfig": 0, "deserializecommst": 0, "deserializecontextphaseparam": 0, "deserializedatatransceiverst": 0, "deserializedebugconfig": 0, "deserializedecodingconfig": 0, "deserializedecodingmod": 0, "deserializedisservingrequeststat": 0, "deserializedynamicbatchconfig": 0, "deserializeeagleconfig": 0, "deserializeexecutorconfig": 0, "deserializeextendedruntimeperfknobconfig": 0, "deserializeexternaldrafttokensconfig": 0, "deserializeguideddecodingconfig": 0, "deserializeguideddecodingparam": 0, "deserializeinflightbatchingstat": 0, "deserializeiterationstat": 0, "deserializeiterationstatsvec": 0, "deserializekvcacheconfig": 0, "deserializekvcacheretentionconfig": 0, "deserializekvcachestat": 0, "deserializelookaheaddecodingconfig": 0, "deserializeloraconfig": 0, "deserializemodeltyp": 0, "deserializemropeconfig": 0, "deserializeorchestratorconfig": 0, "deserializeoutputconfig": 0, "deserializeparallelconfig": 0, "deserializepeftcacheconfig": 0, 
"deserializeprompttuningconfig": 0, "deserializerequest": 0, "deserializerequestperfmetr": 0, "deserializerequeststag": 0, "deserializerequeststat": 0, "deserializerequeststatsperiter": 0, "deserializerequeststatsperiterationvec": 0, "deserializerespons": 0, "deserializeresult": 0, "deserializesamplingconfig": 0, "deserializeschedulerconfig": 0, "deserializesocketst": 0, "deserializespecdecfastlogitsinfo": 0, "deserializespecdecodingstat": 0, "deserializespeculativedecodingconfig": 0, "deserializestaticbatchingstat": 0, "deserializestr": 0, "deserializetensor": 0, "deserializetimepoint": 0, "deserializetokenrangeretentionconfig": 0, "design": [1, 11, 12, 16, 17, 19, 20, 25, 26, 27, 28, 69, 75, 88, 96, 97, 98], "desir": [3, 74, 82, 88, 97], "destin": [55, 56, 57], "destroi": [1, 89], "destroyipcmemori": 1, "destructor": 1, "detail": [0, 3, 5, 11, 12, 14, 16, 20, 26, 28, 29, 30, 36, 40, 45, 59, 64, 73, 74, 75, 77, 81, 82, 84, 89, 92, 93, 96, 97, 98], "detect": [0, 3, 30, 70, 82, 93], "detect_format": 17, "determin": [0, 1, 5, 6, 10, 11, 19, 27, 70, 76, 77, 81, 82, 84, 90, 96, 98, 99], "determinenumpag": 1, "determinist": [27, 80, 93], "detoken": [70, 93, 96], "detokenizedgenerationresultbas": 70, "dev": [66, 67, 93], "devel": [31, 32, 65], "develop": [14, 15, 16, 19, 26, 27, 31, 42, 46, 47, 49, 50, 64, 65, 69, 71, 75, 82, 91, 93, 95], "deviat": 74, "devic": [0, 1, 2, 52, 70, 77, 82, 84, 86, 87, 92], "device_id": 87, "device_map": 86, "device_memory_size_v2": 89, "device_request_typ": 84, "deviceallocationnvl": 1, "devicecach": 1, "devicecacheperc": 0, "deviceid": [0, 1, 2], "dgx": [6, 16, 20, 28], "di": 27, "diagon": 82, "diagram": [12, 28], "diamond": [26, 28], "dict": [14, 17, 19, 70, 82, 84, 87, 93, 95, 98], "dict_kei": 92, "dictat": 79, "dictionari": [15, 17, 70, 83], "didn": 79, "differ": [0, 1, 2, 4, 5, 6, 8, 9, 11, 14, 15, 16, 17, 19, 20, 25, 27, 28, 29, 34, 61, 65, 69, 70, 71, 73, 75, 77, 79, 80, 82, 84, 87, 89, 90, 93, 97], "differenti": 82, "difftyp": 1, 
"diffus": [34, 61, 83, 93], "diffusersattent": 83, "digit": 71, "dilat": [82, 83], "dim": [0, 1, 82, 83, 84, 87, 92], "dim0": 82, "dim1": 82, "dim_head": 83, "dim_in": 83, "dim_out": 83, "dim_rang": 82, "dimems": 1, "dimens": [0, 1, 5, 6, 10, 28, 82, 83, 84, 89, 92, 93, 95], "dimension": 82, "dimrang": 82, "dimtype64": [0, 1], "dir": [36, 65, 69], "direct": [0, 2, 11, 19, 67, 92], "directli": [0, 2, 6, 7, 12, 16, 19, 27, 28, 32, 36, 65, 69, 73, 80, 81, 82, 88, 93, 97, 99], "directori": [0, 3, 14, 15, 16, 17, 19, 29, 55, 56, 57, 65, 70, 73, 74, 75, 84, 87, 88, 93, 95], "disabl": [0, 1, 5, 6, 9, 13, 17, 29, 70, 73, 77, 80, 81, 82, 85, 87, 89, 93], "disable_forward_chunk": 84, "disable_kv_cach": 87, "disable_overlap_schedul": [28, 70], "disable_weight_only_quant_plugin": 84, "disable_xqa": 5, "disablelookahead": 1, "disablelookaheaddecod": 1, "disableseamlesslookaheaddecod": 1, "disadvantag": [19, 76], "disagg_executor": 0, "disaggexecutororchestr": [0, 2], "disaggreg": [0, 64, 70, 93], "disaggregated_param": 70, "disaggregatedparam": 70, "disaggserverbenchmark": [2, 93], "disaggserverutil": 2, "discard": 77, "disclaim": [27, 75, 77, 79, 80], "disclosur": 93, "disconnect": 93, "discourag": [0, 6, 70], "discov": [16, 67], "discrep": [65, 95], "discuss": [5, 27, 75, 77, 80, 81, 93], "disk": [3, 19, 46, 49, 65, 69], "dispatch": [0, 4, 19, 26, 36], "displai": 70, "disservingrequeststat": 0, "disservingstat": 0, "dist": [20, 56, 67, 72, 73, 74, 75], "distanc": [5, 82], "distil": 93, "distinct": [8, 10, 12, 26, 82], "distinguish": 9, "distribut": [1, 4, 5, 6, 16, 26, 40, 41, 73, 82, 87, 89], "distserv": 2, "disturb": 51, "dit": [84, 93], "div": 82, "dive": [27, 71, 72], "divers": [0, 6, 72], "diversity_penalti": 6, "divid": [17, 27, 82, 93], "divup": 82, "dl": 25, "dlsym": 0, "do": [1, 2, 7, 17, 19, 20, 25, 26, 27, 28, 36, 64, 67, 70, 75, 77, 80, 82, 88, 92, 95, 97], "do_cross_attent": [82, 83], "do_layer_norm_befor": 15, "do_sampl": 6, "doc": [1, 20, 24, 26, 32, 77, 80, 
82, 92, 93], "docker": [20, 55, 56, 57, 64, 88, 92, 93], "docker_run_arg": 20, "dockerfil": [31, 65], "document": [0, 2, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 19, 22, 23, 25, 27, 33, 34, 35, 36, 37, 38, 39, 45, 60, 61, 62, 65, 67, 68, 72, 74, 75, 81, 82, 89, 90, 92, 96, 97], "doe": [0, 2, 5, 10, 12, 19, 20, 21, 28, 29, 73, 74, 80, 82, 87, 89, 91, 93, 95, 99], "doesn": [1, 5, 26, 31, 36, 73, 79, 80], "dollar": 73, "domain": 11, "domin": [26, 93], "don": [12, 19, 28, 31, 76, 80, 82], "done": [1, 9, 16, 20, 28, 71, 73, 77, 79, 82, 85, 95], "dongjiyingdji": 93, "dora": [29, 82, 83], "dora_plugin": [10, 29, 82], "dot": [17, 26, 82], "doubl": [0, 22, 78, 80, 92], "down": [0, 2, 3, 10, 21, 27, 28, 58, 71, 76, 82, 87], "down_proj": 17, "download": [18, 55, 56, 57, 58, 65, 66, 67, 69, 73, 75, 88, 92, 93], "downscale_freq_shift": 83, "downsid": 80, "downstream": 90, "dp": [20, 21, 24, 26, 27, 28, 93], "dp8": [26, 28], "dprank": 0, "dpsize": 0, "dq": 64, "draft": [0, 1, 26, 27, 29, 64, 70, 87, 93], "draft_indic": 84, "draft_len": 84, "draft_path": 87, "draft_prob": 84, "draft_target_model": 12, "draft_token": [70, 84], "draft_tokens_extern": [29, 84], "draftacceptancethreshold": 1, "draftbuff": 1, "drafter": [12, 70], "draftindic": 1, "draftlen": 1, "draftlogit": 1, "draftoverhead": 0, "draftparticipantid": 0, "draftpath": 1, "draftpathshost": 1, "draftprob": 1, "draftrequestid": 0, "drafttoken": [0, 1], "drafttokenid": 1, "drafttokensextern": 1, "dram": [0, 16, 70], "drastic": 28, "dreamgenx": 93, "drive": [16, 73], "driven": 71, "driver": [89, 93], "drop": [27, 28, 77, 79, 81], "dropout": 83, "dropout_prob": 83, "dry_run": [29, 70, 93], "dst": 1, "dstate": 82, "dstdesc": 0, "dsttype": 1, "dt_proj": 82, "dt_rank": 82, "dtype": [1, 7, 10, 13, 14, 15, 16, 19, 70, 73, 75, 76, 82, 83, 84, 85, 86, 87, 92, 93, 98], "dual": 65, "due": [0, 12, 19, 23, 26, 28, 65, 73, 75, 79, 81, 87, 93, 97], "dummi": [70, 75, 93], "dump": [0, 3, 65, 70], "dump_debug_buff": 87, "duplic": [28, 93], 
"duplicate_data": 82, "durat": [0, 75], "duration_m": 70, "durationm": 0, "dure": [0, 1, 5, 6, 7, 11, 12, 13, 16, 24, 26, 27, 28, 29, 65, 70, 72, 73, 80, 81, 87, 89, 92, 97, 98], "dynam": [0, 26, 27, 29, 70, 73, 82, 84, 87, 89, 93, 99], "dynamic_batch_config": 70, "dynamic_batch_moving_average_window": 70, "dynamic_quant_bf16tonvfp4": 26, "dynamic_tree_max_topk": [43, 44, 70], "dynamicbatchconfig": [0, 70], "dynamicbatchmovingaveragewindow": 0, "dynamicbatchsizeconfig": 0, "dynamicdecodelay": 1, "dynamicqu": 26, "dynamictreemaxtopk": 0, "dynamictreemaxtopkhost": 1, "dynlibload": 0, "e": [0, 2, 3, 5, 8, 9, 10, 11, 17, 27, 28, 30, 31, 52, 55, 56, 57, 65, 70, 72, 82, 85, 87, 90, 92, 93, 95], "e2": [28, 64], "e4m3": [11, 22], "e5m2": 22, "each": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 20, 26, 27, 28, 29, 30, 36, 52, 55, 56, 57, 70, 73, 74, 75, 76, 79, 80, 81, 82, 83, 85, 87, 89, 90, 92, 93, 96, 97, 98, 99], "eager": [28, 71, 93], "eagl": [0, 1, 29, 40, 41, 43, 64, 70, 84, 87, 93], "eagle2": [40, 41], "eagle3_one_model": 70, "eagle_choic": [43, 44, 70, 87], "eagle_dynamic_tree_max_top_k": 87, "eagle_posterior_threshold": 87, "eagle_temperatur": 84, "eagle_use_dynamic_tre": 87, "eaglechoic": [0, 1], "eagleconfig": [0, 1, 84], "eagledecodingconfig": [43, 44, 70], "eagleforcausallm": 84, "eagleinput": 1, "eaglelastinput": 1, "eaglenetctxcontextlengthshost": 1, "eaglenetctxpastkeyvaluelengthshost": 1, "eaglenetctxrequesttypeshost": 1, "eaglenetgencontextlengthshost": 1, "eaglenetgenpastkeyvaluelengthshost": 1, "eaglenetgenrequesttypeshost": 1, "ealge2": 27, "earli": [87, 92, 93], "earlier": [0, 15, 77, 92], "early_stop": [6, 70, 87, 93], "early_stop_criteria": 87, "earlystop": [0, 1, 6], "eas": [18, 71, 74], "easi": [25, 75], "easier": [16, 19, 20, 27, 73], "easili": [17, 18, 20, 26, 71, 82], "east": [14, 16, 92], "eastern": 88, "ebnf": [0, 3, 70], "echo": [30, 31, 32, 56, 57], "eddi": 93, "edg": 22, "edit": [12, 65], "ef648e7489c040679d87ed12db5d3214": 88, 
"effect": [0, 2, 6, 11, 12, 26, 27, 28, 29, 67, 70, 77, 79, 80], "effici": [4, 5, 6, 9, 12, 16, 18, 26, 27, 28, 29, 34, 42, 46, 47, 49, 50, 61, 89, 91, 94, 96, 97, 98], "effort": [12, 15, 27, 28, 59, 77, 93], "eg": 74, "eight": [20, 21], "einop": 82, "einstein": 82, "einsum": 82, "einsum_eq": 82, "either": [0, 1, 2, 3, 18, 26, 28, 46, 49, 59, 70, 82, 89, 92, 93], "element": [0, 1, 5, 6, 10, 11, 70, 82, 83, 90], "element_typ": 1, "elementwis": [7, 82], "elementwise_affin": 83, "elementwise_binari": 82, "elementwise_sub": 7, "elementwise_sum": 7, "elementwiseoper": [7, 82], "eleutherai": 73, "elif": 99, "elimin": [2, 12, 26, 28, 29, 71, 73, 77, 79, 93], "ellipsi": 82, "els": [0, 16, 17, 19, 36, 52, 54, 59, 82, 92, 99], "elsinor": 51, "emb": [16, 61, 83], "embark": 71, "embed": [0, 9, 14, 27, 29, 70, 73, 82, 87, 93, 95, 97], "embed_dim": 83, "embed_posit": 83, "embed_positions_for_gpt_attent": 83, "embed_positions_for_gpt_attention_loc": 83, "embed_positions_loc": 83, "embed_token": [17, 95], "embedding_bia": 70, "embedding_dim": 83, "embedding_multipli": 84, "embedding_parallel_mod": 70, "embedding_scal": 84, "embedding_sharding_dim": [15, 84], "embeddingbia": [0, 1], "embeddingt": [0, 1], "emerg": [25, 26], "emit": 70, "emphasi": 15, "emploi": [12, 96, 99], "empow": 26, "empti": [0, 1, 12, 36, 82, 93, 99], "emptybuff": 1, "emptygenslot": 0, "emptytensor": 1, "emul": [82, 93], "en": 93, "enabl": [0, 2, 3, 5, 6, 7, 10, 11, 12, 13, 16, 17, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 36, 42, 48, 50, 64, 65, 66, 67, 70, 73, 75, 79, 81, 82, 83, 84, 85, 87, 88, 90, 92, 93, 95, 97, 98], "enable_allreduc": 82, "enable_attention_dp": [20, 30, 56, 70], "enable_batch_size_tun": 70, "enable_block_reus": [30, 43, 44, 51, 54, 70], "enable_build_cach": [70, 93], "enable_chunked_context": [87, 93], "enable_chunked_prefil": [70, 93], "enable_context_fmha_fp32_acc": [70, 87], "enable_debug_output": [29, 70, 92], "enable_forward_chunk": 84, "enable_fp8": [11, 59], "enable_if_t": 1, 
"enable_iter_perf_stat": [30, 70], "enable_iter_req_stat": 70, "enable_kv_cache_reus": 9, "enable_layerwise_nvtx_mark": 70, "enable_lora": [58, 70], "enable_max_num_tokens_tun": [70, 93], "enable_min_lat": 70, "enable_multi_devic": 93, "enable_nvfp4": 59, "enable_overlap_schedul": 30, "enable_partial_reus": 70, "enable_prompt_adapt": [70, 93], "enable_qkv": 83, "enable_tqdm": 70, "enable_trt_overlap": 93, "enable_trtllm_sampl": 70, "enable_ucx": 93, "enable_xqa": 93, "enableattentiondp": [0, 1], "enablebatchsizetun": 0, "enableblockreus": [0, 9], "enablechunkedcontext": 0, "enablecontextfmhafp32acc": 0, "enabled_with_fp32_acc": 5, "enablelookaheaddecod": 1, "enablemaxnumtokenstun": 0, "enablepartialreus": 0, "enableseamlesslookaheaddecod": [0, 1], "enabletrtoverlap": 0, "enc": [29, 87, 93], "enc_dec": 6, "encapsul": [5, 6, 16, 82], "encdecmodelrunn": 87, "encod": [0, 5, 6, 22, 26, 29, 70, 82, 87, 90, 91, 93], "encode_base64_content_from_url": 61, "encoded_vocab": [0, 3], "encodedvocab": [0, 3], "encoder_hidden_st": [83, 84], "encoder_input_featur": 87, "encoder_input_id": 87, "encoder_input_len_rang": 93, "encoder_input_length": [82, 83, 87], "encoder_language_adapter_rout": 87, "encoder_max_input_length": [83, 87], "encoder_output": [83, 84, 87], "encoder_output_length": 87, "encoder_run": 87, "encoderenginebuff": 0, "encoderhiddens": 1, "encoderinputfeatur": 0, "encoderinputtokenid": 0, "encoderjsonconfigstr": 0, "encoderlen": 0, "encodermodel": [0, 84], "encodermodelpath": 0, "encoderoutput": 0, "encoderoutputlength": 0, "encount": [17, 20, 67, 92], "encourag": [0, 6, 19, 70], "end": [0, 1, 5, 6, 16, 27, 29, 43, 44, 49, 53, 54, 59, 70, 71, 73, 77, 80, 81, 82, 88, 93, 98], "end_dim": 82, "end_id": [70, 87, 93], "end_token": [0, 70], "endeavor": 26, "endid": [0, 1], "endpoint": [38, 39, 70, 88, 93], "endswith": 17, "enforc": [75, 82], "engin": [0, 1, 2, 3, 5, 6, 7, 10, 12, 13, 18, 19, 24, 26, 27, 28, 29, 30, 36, 46, 49, 58, 64, 67, 70, 74, 76, 77, 79, 80, 81, 82, 
84, 87, 89, 92, 93], "engine_buff": 87, "engine_dir": [13, 14, 15, 16, 19, 70, 73, 75, 87, 88, 92], "engine_inspector": 87, "engine_llama_3": 16, "engine_nam": 87, "engine_output": 29, "engineaddr": 1, "enginebuff": [0, 1], "enginefilenam": 1, "engineinput": 1, "engineoutput": 1, "enginepath": 1, "engines": 1, "enhanc": [4, 6, 12, 20, 26, 27, 28, 71, 81, 89, 94, 97], "enjoi": [32, 42, 46, 47, 49, 50, 52], "enough": [5, 9, 20, 27, 79, 89, 96, 99], "enqueu": [0, 3, 16, 87, 89, 93], "enqueuecontext": 0, "enqueuegener": 0, "enqueuerequest": [0, 2, 3], "ensur": [2, 3, 4, 7, 19, 27, 65, 70, 73, 79, 85, 95, 98], "enter": [7, 31, 74, 79, 98], "enterpris": 45, "entir": [0, 3, 10, 16, 21, 26, 70, 71, 73, 74, 82, 89, 98], "entri": [0, 10, 40, 50, 66, 67, 73, 82, 88, 93], "entrypoint": [31, 69, 75], "enum": [0, 1, 2], "enumer": [0, 1, 48, 52, 94], "env": [30, 33, 34, 35, 37, 38, 39, 73], "envelop": 53, "environ": [6, 12, 20, 26, 34, 55, 56, 57, 61, 64, 65, 67, 72, 73, 75, 77, 79, 80, 92, 93, 94, 97], "environment": 17, "eo": [6, 70], "eof": [20, 27, 30, 56], "eos_token_id": [3, 87], "ep": [4, 20, 26, 27, 30, 73, 82, 83], "ep2": 26, "ep2tp4": 26, "ep4tp2": 26, "ep8": 28, "ep8tp8": 26, "ep_siz": [30, 37], "epsilon": [0, 82], "eq": 82, "equal": [0, 1, 3, 4, 28, 29, 36, 76, 82, 83, 89], "equal_progress": [70, 81], "equat": [24, 82], "equip": [2, 18], "equival": [26, 28, 77, 82, 95], "equvili": 29, "erenup": 93, "err": [55, 56, 57], "error": [0, 2, 3, 10, 19, 28, 29, 30, 59, 64, 65, 67, 70, 75, 79, 89, 93], "errorcod": 69, "errormsg": 0, "especi": [7, 27, 29, 42, 46, 47, 49, 50, 52, 76, 79, 98], "essenti": [12, 73], "establish": 28, "estim": [59, 73, 93, 99], "et": 21, "etc": [0, 1, 12, 67, 70, 72, 77, 80, 87, 89, 92, 95], "ethnzhng": 93, "eval": 45, "evalu": [11, 22, 23, 28, 64, 93], "even": [2, 5, 6, 16, 19, 25, 26, 29, 51, 75, 79, 82, 89], "evenli": [4, 26], "event": [0, 1, 40, 41, 64, 70], "event_buffer_max_s": [51, 70], "event_id": 51, "eventbuffermaxs": 0, "eventid": 0, 
"eventptr": 1, "ever": [0, 80], "everi": [0, 3, 17, 26, 28, 73, 75, 76, 82, 87], "everyon": 27, "everyth": 16, "evict": [0, 1, 8, 9, 10, 27, 71, 73, 75, 79], "evolv": [5, 19, 26, 71, 90, 98], "ex": [56, 57], "exact": [5, 89], "exam": 26, "examin": 12, "exampl": [0, 5, 6, 7, 9, 12, 13, 14, 18, 19, 21, 23, 25, 27, 30, 36, 45, 52, 55, 59, 64, 65, 69, 70, 74, 75, 76, 77, 78, 79, 80, 81, 82, 87, 88, 89, 90, 91, 92, 93, 94, 95, 97, 99], "example_logits_processor": 52, "exaon": [17, 91, 93], "exc": 48, "exce": [0, 2, 70, 81, 82], "exceed": [0, 89], "except": [0, 3, 5, 6, 19, 26, 27, 29, 54, 76, 82, 92, 93], "excess": 5, "exchang": 70, "excit": [42, 46, 47, 48, 49, 50], "exclud": [70, 77, 82, 93], "exclude_input_from_output": 70, "exclude_modul": [15, 70, 93], "excludeinputfromoutput": 0, "exclus": [1, 6, 90, 93], "exec": 72, "execut": [0, 2, 3, 6, 10, 12, 16, 18, 19, 26, 28, 64, 70, 71, 72, 73, 79, 81, 82, 87, 88, 89, 96, 99], "executor": [1, 2, 9, 12, 13, 18, 36, 52, 58, 64, 70, 71, 73, 81, 87, 89, 93, 96], "executor_config": 98, "executorconfig": [0, 3, 13], "executorexampledisaggreg": 2, "executorexamplefastlogit": 93, "exhaust": [0, 18], "exist": [1, 6, 9, 10, 12, 17, 19, 26, 28, 29, 51, 67, 70, 73, 87, 93, 97], "exit": [74, 87], "exp": 82, "expand": [0, 23, 25, 27, 82, 87, 93], "expand_dim": 82, "expand_dims_lik": 82, "expand_mask": 82, "expand_shap": 82, "expans": 82, "expect": [0, 5, 6, 11, 14, 16, 17, 19, 23, 27, 29, 36, 55, 56, 57, 64, 70, 73, 75, 78, 82, 92, 93], "expens": [3, 12, 71, 76, 77, 81], "experi": [12, 24, 25, 26, 28, 69, 71, 72, 73, 92], "experiment": [5, 6, 12, 17, 27, 30, 55, 56, 57, 64, 73, 90, 93, 94], "expert": [10, 20, 30, 50, 64, 70, 80, 93], "expertis": [26, 28], "expir": 0, "explain": [6, 16, 18, 28, 79, 82, 89, 90, 96, 97], "explan": [20, 28, 80, 87, 89], "explicit": [0, 1, 12, 82, 93], "explicit_draft_token": [12, 29, 84], "explicitdrafttoken": [0, 1], "explicitdrafttokensinput": 1, "explicitdrafttokenslastinput": 1, 
"explicitdrafttokensmodul": 1, "expliciteosstop": 0, "explicitli": [1, 2, 7, 12, 16, 17, 28, 29, 30, 36, 70, 93], "explor": [12, 26, 28, 71], "expon": 22, "exponenti": 12, "export": [2, 11, 15, 19, 20, 26, 27, 29, 30, 38, 39, 55, 56, 57, 73, 86, 87, 92, 93], "export_fmt": 94, "expos": [0, 6, 16, 32, 65, 77, 93], "express": [0, 3, 70, 82], "extend": [0, 3, 9, 16, 26, 27, 28, 70, 80, 82, 93], "extended_runtime_perf_knob_config": [70, 93], "extendedruntimeperfknobconfig": [0, 70], "extens": [15, 18, 67, 71, 73, 93], "extern": [0, 7, 8, 17, 87, 89], "external_checkpoint_dir": 17, "external_kei": 17, "external_weight": 17, "externaldrafttoken": 0, "externaldrafttokensconfig": [0, 1], "externaldrafttokensinput": 1, "externalstream": 52, "extra": [0, 2, 5, 9, 12, 15, 20, 26, 27, 29, 30, 37, 67, 70, 74, 76, 77, 87, 93], "extra_arg": 56, "extra_data": 70, "extra_id": 9, "extra_llm_api_opt": [20, 27, 30, 37, 56, 73, 74], "extra_resource_manag": 70, "extra_token": 83, "extract": [0, 3, 65, 72, 78, 82, 87], "extrapol": 82, "extrem": [16, 26, 77, 79, 80], "f": [0, 5, 6, 31, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 58, 59, 66, 67, 70, 72, 75, 81, 82, 88, 92, 94], "face": [3, 10, 13, 18, 19, 36, 70, 73, 84, 88, 93], "facilit": [7, 12, 88], "fact": [71, 73, 80], "factor": [25, 28, 76, 77, 82, 83, 89, 90], "factori": [19, 70, 87, 93], "factual": 6, "fail": [70, 87, 89, 92, 99], "failur": [17, 93], "fairli": 16, "fairseq": [91, 93], "fake": [9, 93], "fakebuff": 1, "falcon": [15, 25, 69, 73, 90, 91, 93], "falconconfig": 84, "falconforcausallm": 84, "falconmodel": 84, "fall": [11, 67, 74, 93], "fallback": 17, "fals": [0, 1, 2, 3, 5, 6, 7, 9, 15, 26, 28, 29, 30, 45, 51, 54, 56, 70, 82, 83, 84, 85, 86, 87, 93], "false_output_valu": 82, "false_valu": 82, "famili": [5, 17, 91, 93], "familiar": [6, 16, 69, 75, 76, 78, 88], "famou": [6, 59], "faq": 64, "far": [0, 3, 27], "fast": [0, 5, 8, 12, 70, 73, 76, 93], "fast_build": [29, 70, 93], "fastapi": 93, "fastapi_serv": 93, 
"faster": [5, 19, 22, 23, 27, 28, 29, 74, 75, 82], "fasterdecod": 54, "fastlogit": 0, "fault": 93, "favor": 93, "favorit": 58, "fc": [15, 16, 17, 92], "fc_gate": 83, "fc_gate_dora": 83, "fc_gate_lora": 83, "fc_gate_plugin": 83, "featur": [0, 2, 3, 5, 7, 8, 10, 11, 12, 15, 16, 17, 19, 25, 26, 27, 28, 29, 55, 56, 57, 64, 65, 73, 77, 79, 80, 81, 82, 85, 87, 91, 94, 95, 97], "feature_dim": 87, "februari": 28, "fed": [74, 84], "feed": 82, "feedback": 93, "feedforward": 4, "feel": 58, "fetch": [0, 27, 30, 96], "few": [9, 16, 19, 25, 28, 79], "fewer": [5, 12, 21, 97], "ffn": [4, 26], "ffn_hidden_s": 83, "fhma": 93, "field": [0, 6, 15, 19, 30, 32, 36, 70, 71, 73, 77, 84, 85, 90, 93, 97], "field_nam": 70, "figur": [26, 27], "file": [0, 3, 4, 5, 7, 9, 15, 16, 17, 19, 20, 27, 29, 30, 38, 39, 67, 70, 72, 73, 74, 87, 88, 93, 95], "filepath": 1, "filesystem": [0, 1], "fill": [1, 17, 32, 42, 46, 47, 49, 50, 82, 97], "fill_attention_const_params_for_long_rop": 83, "fill_attention_const_params_for_rop": 83, "fill_attention_param": 83, "fill_none_tensor_list": 83, "fill_valu": [52, 82], "fillemptyfieldsfromruntimedefault": 0, "filloper": 82, "filltaskstensor": 1, "filter_medusa_logit": 87, "final": [0, 1, 10, 26, 27, 29, 30, 31, 36, 82, 99], "final_logit_softcap": 84, "final_output_id": 87, "finalize_decod": 87, "find": [20, 28, 77, 82, 92, 93], "find_best_medusa_path": 87, "fine": [12, 20, 28, 73, 80, 83], "finer": 7, "finetun": 26, "finish": [0, 1, 3, 6, 8, 19, 27, 36, 53, 69, 70, 71, 73, 87, 96, 98], "finish_reason": [53, 70, 88, 93], "finishedst": 1, "finishedsum": 1, "finishreason": [0, 1, 93], "first": [0, 1, 2, 3, 5, 6, 7, 9, 10, 12, 18, 23, 25, 27, 28, 29, 30, 31, 67, 69, 70, 73, 74, 75, 77, 79, 80, 81, 82, 89, 92, 93, 95, 97, 98, 99], "first_come_first_serv": [70, 81], "first_gen_token": 70, "first_lay": 87, "firstgentoken": 0, "firstit": 0, "firstli": [28, 31, 79, 89], "firstscheduledtim": 0, "firsttokentim": 0, "fit": [1, 5, 21, 22, 70, 76, 77, 99], "fitting_request": 99, 
"fix": [8, 10, 12, 28, 73, 89], "fjosw": 93, "flag": [0, 1, 3, 5, 10, 19, 24, 30, 36, 64, 73, 77, 78, 79, 81, 82, 89, 93], "flags_siz": 1, "flan": [90, 91], "flash": [5, 16], "flashattent": [5, 16, 88], "flashinf": 97, "flashinferattent": 97, "flashmla": [27, 93], "flatten": [1, 10, 24, 82, 83], "flattenedinouts": 1, "flattenn": 1, "flayer": 7, "flayerinfomemo": 7, "flexibl": [12, 19, 26, 36, 65], "flight": [1, 18, 64, 73, 79, 81, 88, 89, 93], "flip": 82, "flip_sin_to_co": 83, "float": [0, 1, 6, 13, 15, 16, 22, 52, 70, 81, 82, 83, 84, 87, 90], "float16": [7, 10, 13, 14, 15, 19, 29, 76, 82, 84, 85, 88, 92], "float2": 82, "float32": [0, 15, 29, 82, 83, 84, 85], "floattensor": 95, "floattyp": [0, 1], "floor_div": 82, "floordiv": 82, "flop": 28, "flow": [7, 19, 26, 28, 75, 76, 77, 79, 80, 93, 96, 99], "fly": [5, 82, 90], "fmha": [0, 29, 70, 82, 87, 89, 93], "fmt_dim": 1, "focu": [7, 25, 26, 72], "focus": [12, 73, 77, 78, 93], "fold": 89, "folder": [0, 3, 6, 19, 75, 90, 91, 93], "folder_trt_llm": 16, "follow": [3, 6, 7, 10, 12, 14, 15, 16, 17, 19, 20, 25, 26, 27, 28, 29, 30, 32, 36, 47, 48, 51, 55, 56, 57, 65, 66, 67, 69, 73, 74, 75, 76, 77, 78, 79, 80, 82, 88, 90, 91, 93, 94, 95, 97, 98], "footprint": [5, 21, 28, 89], "for_each_rank": 84, "forc": [0, 5, 11, 26, 73], "force_drop_id": 83, "force_low_precision_all_reduce_strategi": 11, "force_multi_block_mod": 73, "force_nccl_all_reduce_strategi": 93, "force_num_profil": 70, "force_words_id": 6, "forecast": 12, "foretel": 51, "fork": 72, "form": [0, 3, 5, 12, 70, 82, 88], "format": [0, 3, 11, 15, 17, 19, 22, 25, 27, 28, 39, 64, 65, 69, 70, 71, 75, 77, 87, 88, 89, 92, 93, 97], "former": [16, 25, 51], "formula": [28, 82], "forum": 93, "forward": [0, 1, 7, 12, 14, 16, 27, 52, 81, 82, 83, 84, 92, 93, 95, 96, 97, 98, 99], "forward_loop": 73, "forward_with_cfg": 84, "forward_without_cfg": 84, "forwardasync": 1, "forwarddispatch": 1, "forwardsync": 1, "found": [3, 4, 5, 6, 7, 12, 16, 18, 22, 65, 67, 73, 75, 77, 80, 90, 99], 
"foundat": 27, "four": [3, 7, 12, 15, 26, 27, 83], "fourth": 3, "fp": [90, 93], "fp16": [5, 10, 11, 13, 15, 17, 21, 22, 25, 29, 64, 73, 77, 80, 82, 88, 91, 92, 93], "fp32": [0, 5, 26, 28, 29, 64, 70, 82, 87, 88, 91, 92, 93], "fp4": [20, 27, 28, 29, 93], "fp8": [11, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 46, 54, 59, 64, 70, 73, 78, 80, 82, 85, 89, 91, 93, 94, 97], "fp8_block_scal": 70, "fp8_blockscale_gemm": 93, "fp8_inputs_overrid": 82, "fp8_kv_cach": [5, 90], "fp8_per_channel_per_token": 70, "fp8_qdq": 90, "fp8_rowwise_gemm_plugin": 29, "fp_valu": 5, "fpa_intb": 93, "fraction": [0, 30, 70, 82, 83, 87], "framework": [12, 14, 15, 18, 19, 71, 82, 93], "franc": [14, 16, 40, 42, 43, 44, 46, 47, 48, 49, 50, 54, 59, 66, 67, 75, 81, 88, 92, 94], "free": [0, 1, 8, 10, 16, 17, 28, 30, 71, 79, 83, 84, 87, 89, 98], "free_gpu_memory_fract": [30, 36, 49, 53, 70, 81, 93], "free_resourc": [96, 98], "freed": 73, "freedom": 19, "freegpumemoryfract": [0, 89, 93], "freenumblock": 0, "freez": 28, "french": 88, "freq": 82, "frequenc": [73, 83], "frequency_penalti": [70, 87, 93], "frequencypenalti": [0, 1, 6], "frequent": [9, 92], "friend": [0, 1, 73], "friendli": 82, "from": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 23, 25, 26, 27, 28, 29, 30, 31, 32, 36, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 64, 66, 67, 69, 70, 71, 73, 74, 75, 76, 77, 79, 80, 81, 82, 83, 84, 86, 87, 88, 89, 92, 93, 94, 95, 96, 97, 98, 99], "from_argu": 84, "from_checkpoint": [19, 84], "from_config": 84, "from_dict": [70, 84], "from_dir": 87, "from_engin": 87, "from_hugging_fac": [14, 17, 19, 84], "from_jax": 19, "from_json_fil": [70, 84], "from_kera": 19, "from_meta_ckpt": [19, 84], "from_nemo": [19, 84], "from_pretrain": 84, "from_prun": 84, "from_serialized_engin": 87, "from_str": 82, "fromfil": 16, "fruit": 28, "full": [0, 4, 5, 6, 9, 10, 12, 22, 23, 27, 28, 30, 70, 71, 72, 73, 76, 82, 87, 88, 89, 92], "full_lik": 52, "fulli": [28, 40, 
93], "funcnam": 0, "function": [0, 1, 3, 5, 13, 14, 16, 18, 19, 26, 27, 69, 70, 71, 72, 80, 85, 87, 89, 90, 91, 92, 93, 98, 99], "functiont": 0, "further": [3, 4, 5, 12, 16, 21, 25, 27, 28, 29, 73, 77, 80, 97], "furthermor": [12, 26, 77], "fuse": [5, 12, 16, 26, 28, 29, 80, 82, 88, 93, 95, 97], "fuse_a": [26, 28], "fuse_fp4_qu": 29, "fuse_qkv_project": 84, "fuseattentionwithbiaspass": 7, "fused_gate_up_dora": 83, "fused_gate_up_lora": 83, "fused_mo": 70, "fusedgatedmlp": [82, 83], "fusevalu": 1, "fusion": [7, 28, 29, 64, 71, 79, 89, 90, 93, 97], "fusion_op": 82, "futur": [2, 5, 6, 8, 12, 17, 19, 25, 29, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 54, 59, 65, 66, 67, 69, 70, 71, 73, 75, 81, 82, 88, 89, 90, 93, 94], "fuyu": [91, 93], "g": [3, 8, 11, 17, 27, 28, 30, 52, 55, 56, 57, 70, 79, 87, 95], "g1": 79, "g2": 79, "gain": [76, 79], "gamma": 82, "gate": [10, 17, 29, 75, 82, 93], "gate_a": 82, "gate_a_bia": 82, "gate_bia": 82, "gate_proj": 17, "gate_x": 82, "gate_x_bia": 82, "gatedmlp": [82, 83], "gather": [0, 1, 29, 47, 48, 70, 82, 87], "gather_all_token_logit": [29, 93], "gather_context_logit": [29, 70, 84, 87], "gather_dim": [16, 82], "gather_generation_logit": [29, 70, 84, 87], "gather_last_token_logit": 82, "gather_nd": 82, "gather_output": 83, "gathercontext": [0, 93], "gatheredid": 1, "gatherel": 82, "gathergenerationlogit": 0, "gathermod": 82, "gathertre": 1, "gatherv2": 82, "gb": [2, 23, 28, 65, 70, 73], "gb200": [28, 93], "gcc": 65, "gd": 0, "gdrdma": 2, "geforc": 93, "gegelu": 82, "gegelu_limit": 83, "geglu": 82, "gelu": [82, 84], "gelu_pytorch_tanh": 93, "gelu_tanh": 83, "gemm": [7, 28, 29, 79, 82, 88, 89, 93], "gemm_allreduc": 82, "gemm_allreduce_plugin": [29, 87], "gemm_fc1": 26, "gemm_plugin": [10, 13, 15, 16, 29, 73, 77, 80, 83, 88], "gemm_swiglu": 82, "gemm_swiglu_plugin": [29, 77, 85], "gemma": [19, 69, 90, 91, 93], "gemma2": 91, "gemma2_added_field": 84, "gemma2_config": 84, "gemma3": 93, "gemma3_added_field": 84, "gemma3_config": 84, 
"gemma_added_field": 84, "gemma_config_kwarg": 84, "gemmaconfig": 84, "gemmaforcausallm": 84, "gen": [70, 93], "genai": [25, 30, 63], "genattent": 26, "genenginepath": 0, "gener": [0, 1, 3, 6, 9, 12, 15, 16, 17, 19, 20, 21, 22, 24, 26, 27, 28, 29, 40, 41, 42, 51, 64, 66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 78, 79, 80, 81, 82, 84, 87, 88, 89, 91, 92, 93, 94, 95, 96, 97, 98, 99], "generate_alibi_bias": 82, "generate_alibi_slop": 82, "generate_async": [36, 47, 48, 70, 93], "generate_logn_sc": 82, "generate_tllm_weight": 17, "generated_text": [40, 43, 44, 54, 58, 66, 67, 75, 81, 88, 94], "generatedtokensperenginestep": 1, "generation_complet": 99, "generation_in_progress": 99, "generation_logit": [53, 70, 87], "generation_onli": 70, "generation_phas": 5, "generation_request": 99, "generation_to_complet": 99, "generationexecutor": [2, 93], "generationlength": 1, "generationlengthsdevic": 1, "generationlengthshost": 1, "generationlengthshostcopi": 1, "generationlogit": 0, "generationmixin": 84, "generationrequestid": 2, "generationresult": 70, "generationsequ": 87, "generationsess": [5, 87, 89], "generationstep": 1, "genericprompttuningparam": 1, "genert": 2, "genexecutorconfig": 0, "genidx": 0, "genrequest": 1, "genrespons": 2, "get": [0, 1, 2, 3, 5, 7, 10, 13, 17, 24, 27, 28, 30, 31, 32, 36, 40, 41, 65, 66, 67, 70, 71, 72, 75, 77, 82, 84, 87, 88, 92, 93, 94, 99], "get_1d_sincos_pos_embed_from_grid": 83, "get_2d_sincos_pos_emb": 83, "get_2d_sincos_pos_embed_from_grid": 83, "get_audio_featur": 87, "get_batch_cache_indic": 98, "get_batch_idx": 87, "get_block_offset": 87, "get_buff": 98, "get_comm": 70, "get_config_group": 84, "get_context_phase_param": 70, "get_device_cap": 59, "get_first_past_key_valu": 83, "get_hf_config": 84, "get_input": 7, "get_kv_cache_ev": [51, 70], "get_kv_cache_events_async": 70, "get_max_resource_count": [98, 99], "get_needed_resource_to_complet": [98, 99], "get_next_medusa_token": 87, "get_num_free_block": 98, "get_num_heads_kv": 87, 
"get_output": [7, 16], "get_par": [7, 82], "get_pytorch_backend_config": 70, "get_request_typ": 70, "get_rope_index": 87, "get_seq_idx": 87, "get_shap": 17, "get_slic": 17, "get_stat": [70, 93], "get_stats_async": 70, "get_timestep_embed": 83, "get_us": [7, 82], "get_visual_featur": 87, "get_vocab": [0, 3], "get_weight": 83, "getacceptancethreshold": 0, "getacceptedlengthscumsum": 1, "getacceptedpackedpath": 1, "getadditionalmodeloutput": 0, "getadditionaloutputnam": 0, "getaddr": 0, "getaddress": 1, "getagentst": 0, "getallnewtoken": 1, "getallottedtimem": 0, "getattentionconfig": 0, "getbackend": 0, "getbackendagentdesc": 0, "getbadword": 0, "getbatchingtyp": 0, "getbatchsizet": 0, "getbeamsearchbuff": 1, "getbeamsearchdiversityr": 0, "getbeamwidth": 0, "getbeamwidtharrai": 0, "getbuffermanag": 1, "getcachest": 0, "getcachetransceiverconfig": 0, "getcapac": 1, "getcapacityschedulerpolici": 0, "getclientid": 0, "getcommptr": 1, "getcommst": 0, "getcommunicationmod": 0, "getcommunicationtyp": 0, "getconfig": 0, "getconnect": 0, "getconnectioninfo": 0, "getcontextchunkingpolici": 0, "getcontextexecutor": 0, "getcontextfmha": 1, "getcontextparallel": 1, "getcontextparallelgroup": 1, "getcontextparallelrank": 1, "getcontextphaseparam": 0, "getcopyonpartialreus": 0, "getcpu": 1, "getcpudiff": 1, "getcrossattentionmask": 0, "getcrosskvcachefract": 0, "getcudagraphcaches": 0, "getcudagraphmod": 0, "getcumlogprob": 1, "getdata": 0, "getdatatyp": [0, 1], "getdatatypenam": 1, "getdebugconfig": 0, "getdebuginputtensor": 0, "getdebugoutputtensor": 0, "getdebugtensornam": 0, "getdebugtensorsmaxiter": 0, "getdecodedurationm": 0, "getdecoderetentionprior": 0, "getdecoderst": 1, "getdecoderstream": 1, "getdecodingconfig": 0, "getdecodingmod": 0, "getdefaultbatchslot": 1, "getdefaulteaglechoic": 1, "getdesc": 0, "getdevic": 1, "getdevicecacheperc": 0, "getdeviceid": 0, "getdeviceof": 1, "getdimens": 1, "getdirectori": 0, "getdrafttoken": 0, "getdstdesc": 0, 
"getdynamicbatchconfig": 0, "getdynamicbatchmovingaveragewindow": 0, "getdynamictreemaxtopk": 0, "geteaglechoic": 0, "geteagleconfig": 0, "getearlystop": 0, "getembeddingbia": 0, "getembeddingt": 0, "getenablebatchsizetun": 0, "getenableblockreus": 0, "getenablechunkedcontext": 0, "getenablecontextfmhafp32acc": 0, "getenablemaxnumtokenstun": 0, "getenablepartialreus": 0, "getenabletrtoverlap": 0, "getencodedvocab": 0, "getencoderhiddens": 1, "getencoderinputfeatur": 0, "getencoderinputtokenid": 0, "getencoderoutputlength": 0, "getendid": 0, "geterrormsg": 0, "geteventbuffermaxs": 0, "getexecutionconfig": 1, "getextendedruntimeperfknobconfig": 0, "getexternaldrafttokensconfig": 0, "getfastlogit": 0, "getfinishedstep": 1, "getfinishedsum": 1, "getfinishreason": 1, "getfirstgentoken": 0, "getfirstlocallay": 1, "getfreegpumemoryfract": 0, "getfrequencypenalti": 0, "getfunctionpoint": 0, "getgatheredid": 1, "getgathergenerationlogit": 0, "getgemmallreducedtyp": 1, "getgenexecutor": 0, "getgpu": 1, "getgpudiff": 1, "getgpuspergroup": 1, "getgpuspernod": 1, "getgpuweightsperc": [0, 13], "getguid": 0, "getguideddecodingconfig": 0, "getguideddecodingparam": 0, "getguidetyp": 0, "gethandl": 0, "gethiddens": 1, "gethostcaches": 0, "gethostmemori": 1, "getid": 1, "getinittozero": 1, "getinputtokenextraid": 0, "getinputtokenid": 0, "getinst": [0, 1], "getipcunicastpoint": 1, "getisorchestr": 0, "getiterstatsmaxiter": 0, "getjointdecodinginput": 1, "getjointdecodingoutput": 1, "getkvcacheconfig": 0, "getkvcacheconfigref": 0, "getkvcacheeventmanag": 0, "getkvcacheretentionconfig": 0, "getkvcachetyp": 1, "getkvdatatyp": 1, "getlanguageadapteruid": 0, "getlastrank": 1, "getlatestdebugtensor": 0, "getlatestev": 0, "getlatestiterationstat": [0, 3], "getlatestrequeststat": 0, "getlayertyp": 1, "getlen": 0, "getlengthpenalti": 0, "getlevel": 1, "getlocalagentdesc": 0, "getlocalrank": 1, "getlogit": 0, "getlogitsdtyp": 1, "getlogitspostprocessor": 0, "getlogitspostprocessorconfig": 0, 
"getlogitspostprocessornam": 0, "getlogprob": 1, "getlookaheadconfig": 0, "getlookaheaddecodingconfig": 0, "getlookaheaddecodingmaxnumrequest": 0, "getloraconfig": 0, "getloramodul": 1, "getloraprefetchdir": 0, "getmanagedweightsmapopt": 1, "getmanageweightstyp": 1, "getmaxadapters": 0, "getmaxattentionwindowvec": 0, "getmaxbatchs": [0, 1], "getmaxbeamwidth": [0, 1], "getmaxdecodingdecodertoken": 1, "getmaxdecodingdrafttoken": 1, "getmaxdecodingenginetoken": 1, "getmaxdecodingtoken": 1, "getmaxdraftpathlen": 1, "getmaxencoderlen": 1, "getmaxinputlen": 1, "getmaxlorarank": 1, "getmaxnonleafnodesperlay": 1, "getmaxnumpath": 1, "getmaxnumtoken": [0, 1], "getmaxpagesperblock": 1, "getmaxpagesperblockdevic": 0, "getmaxpagesperblockhost": 0, "getmaxpathlen": 1, "getmaxpositionembed": 1, "getmaxpromptembeddingtables": 1, "getmaxqueues": 0, "getmaxseqidlemicrosecond": 0, "getmaxsequencelen": 1, "getmaxsequencelength": 1, "getmaxtoken": 0, "getmedusachoic": [0, 1], "getmemorytyp": [0, 1], "getmemorytypenam": 1, "getminp": 0, "getmintoken": 0, "getmlphiddens": 1, "getmodelconfig": [0, 1], "getmodelconfigmut": 1, "getmodelnam": 1, "getmodelvari": 1, "getmpist": 0, "getmropeconfig": 0, "getmropepositiondelta": 0, "getmroperotarycossin": 0, "getmultiblockmod": 0, "getmulticastpoint": 1, "getmultimodalembed": 0, "getnam": [0, 1], "getnbattentionlay": 1, "getnbhead": 1, "getnbkvhead": 1, "getnblay": 1, "getnbrnnlay": 1, "getnextdrafttoken": 1, "getnextdrafttokenslength": 1, "getngrams": 0, "getnoderank": 1, "getnoderankof": 1, "getnorepeatngrams": 0, "getnormalizelogprob": 0, "getnotifiedsyncmessag": 0, "getnumcopystream": [0, 1], "getnumdecodingenginetoken": 1, "getnumdevicemodulelay": 0, "getnumensurework": 0, "getnumhostmodulelay": 0, "getnumkvheadsperlay": 1, "getnumkvheadsperlayerlocalrang": 1, "getnumlanguag": 1, "getnumnod": 0, "getnumpackedmask": 1, "getnumpag": 1, "getnumputwork": 0, "getnumresponsesreadi": 0, "getnumreturnbeam": [0, 1], "getnumreturnsequ": 0, 
"getnumtransformerlay": 1, "getonboardblock": 0, "getop": 0, "getoptimaladapters": 0, "getoptprofilessplitpoint": 1, "getorchestratorconfig": 0, "getorchleadercomm": 0, "getoutputconfig": 0, "getpadid": 0, "getpagedcontextfmha": 1, "getpageptr": 1, "getpagewidth": 1, "getparallelconfig": 0, "getparentid": 1, "getparticipantid": 0, "getpath": 1, "getpathopt": 1, "getpeftcacheconfig": 0, "getperblockretentionprioritydur": 0, "getpin": 1, "getpinneddiff": 1, "getpinnedpool": 1, "getpinnedpooldiff": 1, "getpipelineparallel": 1, "getpipelineparallelgroup": 1, "getpipelineparallelrank": 1, "getpositionid": 0, "getposteriorthreshold": 0, "getppreducescatt": 1, "getprecis": 1, "getpresencepenalti": 0, "getprevdrafttokenslength": 1, "getprior": 0, "getprocessorbatch": 0, "getprocessormap": 0, "getprompttableoffload": 0, "getprompttuningconfig": 0, "getquantmod": 1, "getrank": 1, "getrecvpollperiodm": 0, "getremotenam": 0, "getrepetitionpenalti": 0, "getrepl": 0, "getreqid": 0, "getrequestid": 0, "getrequeststatsmaxiter": 0, "getrequesttyp": 0, "getresult": [0, 2, 3], "getreturnallgeneratedtoken": 0, "getrnnconfig": 1, "getrotaryembeddingdim": 1, "getruntimedefault": 1, "getruntimetyp": 0, "getsamplingconfig": [0, 1], "getschedulerconfig": 0, "getschedulerconfigref": 0, "getse": 0, "getsecondaryoffloadminprior": 0, "getselfidx": 0, "getsequencelength": 1, "getserializedst": 0, "getshap": [0, 1], "getsinktokenlength": 0, "getsiz": [0, 1], "getsizeinbit": 1, "getsizeinbyt": [0, 1], "getsizeperhead": 1, "getskipcrossattnblock": 0, "getslotsperpag": 1, "getsocketst": 0, "getspawnprocess": 0, "getspecdecconfig": 0, "getspeculativedecodingmod": 1, "getspeculativedecodingmodul": 1, "getspeculativedecodingmoduleptr": 1, "getsrcdesc": 0, "getstat": 0, "getstatu": 1, "getstoptokenid": 0, "getstopword": 0, "getstream": [0, 1], "getsumlocalkvhead": 1, "getsyncmessag": 0, "gettag": 0, "gettaskid": 0, "gettemperatur": 0, "gettensorparallel": 1, "gettensorparallelgroup": 1, 
"gettensorparallelrank": 1, "getter": 6, "gettoken": 0, "gettokenizerstr": 0, "gettokenrangeretentionconfig": 0, "gettokensperblock": 1, "gettopk": 0, "gettopp": 0, "gettoppdecai": 0, "gettoppmin": 0, "gettoppresetid": 0, "gettotalnumpag": 1, "gettransfermod": 0, "gettyp": [0, 1], "getunderlyingdecod": 1, "getunicastpoint": 1, "getusegpudirectstorag": 0, "getuvm": 1, "getuvmdiff": 1, "getverificationsets": 0, "getvers": 1, "getvocabs": 1, "getvocabsizepad": 1, "getweight": 0, "getwindows": 0, "getworkerexecutablepath": 0, "getworlds": 1, "gh200": 93, "ghost": 51, "ghz": 45, "gib": [9, 89], "gid": 0, "gigabyt": 23, "git": [10, 20, 65, 69, 88, 92, 94], "github": [19, 20, 26, 65, 69, 71, 88, 93, 94], "give": [3, 27, 28, 71, 77, 79, 84], "given": [0, 1, 3, 6, 10, 17, 19, 23, 69, 70, 72, 78, 79, 82, 83, 84, 86, 87, 89, 90, 93, 98], "givyboi": 58, "glm": [69, 82, 91, 93], "glm4": [69, 93], "global": [0, 5, 8, 16, 26, 28, 93], "global_max_input_length": 87, "global_max_output_length": 87, "globalrequestid": 0, "glossari": [21, 24], "gm": 92, "gnu": 65, "go": [5, 6, 51, 76, 93], "goal": 81, "goe": [27, 69, 73], "good": [3, 16, 20, 28, 73, 76, 79, 80], "got": [0, 42, 45, 46, 47, 48, 49, 50, 51, 52, 58, 59, 69, 73, 92], "gpqa": [26, 28], "gpt": [1, 5, 12, 16, 18, 22, 25, 29, 64, 69, 73, 82, 89, 90, 91, 92, 93], "gpt2": [84, 92], "gpt3": 23, "gpt_attent": [5, 7, 24, 82, 88, 93], "gpt_attention_plugin": [10, 16, 29, 73, 83, 87, 92, 93], "gpt_attention_plugin_remove_pad": 7, "gpt_variant": [84, 93], "gptattent": 7, "gptattentionpluginremovepaddingrewritepass": 7, "gptconfig": 84, "gptdecod": 6, "gptdecoderbatch": 93, "gptdecoderptr": 1, "gptforcausallm": 84, "gptj": 84, "gptjconfig": 84, "gptjforcausallm": 84, "gptjmodel": 84, "gptlmheadmodel": 92, "gptmanag": 93, "gptmanagerbenchmark": [9, 65, 93], "gptmodel": 84, "gptmodelconfig": 93, "gptneoxforcausallm": 84, "gptneoxmodel": 84, "gptq": [25, 64, 91, 93], "gptsession": 93, "gptsessionbenchmark": 93, "gpu": [0, 1, 2, 3, 4, 5, 
6, 8, 9, 10, 11, 12, 15, 18, 19, 22, 23, 24, 25, 27, 29, 30, 36, 55, 56, 57, 59, 64, 65, 67, 69, 70, 74, 75, 76, 77, 80, 82, 84, 87, 88, 91, 92, 93, 96, 97], "gpu_weights_perc": [13, 87], "gpudirect": 0, "gpumemusag": [0, 30], "gpus_per_nod": [29, 30, 70], "gpuspernod": [1, 6], "gpusync": 1, "gpuweightsperc": [0, 13], "gqa": [5, 8, 21, 24, 29, 82, 93, 97], "grace": [9, 64, 91], "gradient": 22, "gradual": 19, "grain": 7, "gram": 12, "grammar": [0, 3, 70], "granit": [91, 93], "graph": [0, 16, 20, 27, 28, 64, 70, 72, 73, 82, 87, 88, 89, 92, 93, 97, 98], "graph_rewrit": 7, "graphic": 53, "gratitud": 27, "gre": 30, "great": [21, 53], "greater": [0, 2, 5, 24, 25, 26, 29, 82], "greatli": [9, 19, 27, 77, 80], "greedi": [0, 6, 96], "greedy_sampl": [43, 44, 70], "greedysampl": 0, "greedysamplinghost": 1, "grid": [16, 77, 79, 82, 83], "grid_search_engin": 75, "grid_siz": 83, "grok": [91, 93], "ground": 72, "groundbreak": 71, "group": [0, 3, 4, 6, 8, 16, 21, 28, 64, 70, 82, 83, 90, 93, 97], "group_cl": 84, "group_norm": 82, "group_siz": [15, 70, 82], "groupedrmsnorm": 26, "groupgemm": 28, "groupnorm": [82, 83], "grow": [1, 12, 79], "gsm8k": 28, "gt": 82, "gtc": [20, 26], "guarante": [0, 6, 9, 19, 73, 74, 75, 77, 81], "guaranteed_no_evict": [0, 70, 73, 81], "guaranteednoevictschedul": 99, "guard": [51, 75], "guid": [0, 16, 20, 25, 40, 41, 64, 69, 70, 71, 72, 74, 75, 76, 77, 80, 82, 92, 93, 97], "guidanc": [12, 30, 80, 83, 84], "guided_decod": [45, 70], "guided_decoding_backend": [45, 70], "guideddecodingbackend": 0, "guideddecodingconfig": [0, 3], "guideddecodingparam": [0, 3, 45, 70], "guidelin": [2, 76], "guidetyp": [0, 3], "gw": 7, "h": [2, 3, 5, 12, 17, 27, 29, 30, 33, 34, 35, 75, 82, 84, 88, 93], "h0": 27, "h1": 82, "h100": [19, 25, 29, 71, 74, 75, 77, 78, 79, 93], "h20": 29, "h200": [22, 29, 74, 93], "h2d": 52, "ha": [0, 1, 3, 5, 9, 10, 11, 15, 16, 17, 19, 20, 21, 25, 26, 27, 28, 29, 32, 65, 70, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 87, 89, 90, 92, 93, 96, 98, 99], 
"had": [19, 28, 77, 79], "half": [0, 1, 16, 28, 75, 82], "halv": [22, 82], "hand": [9, 12, 18, 76], "handl": [0, 1, 2, 4, 8, 17, 19, 21, 26, 75, 77, 79, 80, 81, 82, 83, 95, 96], "handle_per_step": 87, "hang": [0, 69, 92, 93], "happen": [3, 6, 9, 16, 67, 89, 92], "happi": 87, "har": 28, "hard": [5, 70], "harder": 6, "hardwar": [8, 25, 28, 36, 64, 65, 93], "has_affin": 82, "has_bia": 82, "has_config_group": 84, "has_position_embed": 87, "has_scal": 82, "has_token_type_embed": 87, "has_zero_point": [15, 70], "hascontextawaitthread": 0, "hasdraftlogit": 1, "haserror": [0, 3], "hasgenawaitthread": 0, "hash": [0, 70], "hasresult": 0, "hasrnnconfig": 1, "hasspeculativedecodingmodul": 1, "hattizai": 93, "have": [0, 1, 3, 4, 5, 6, 9, 10, 12, 15, 16, 17, 19, 20, 21, 23, 25, 26, 27, 28, 29, 31, 51, 54, 55, 56, 57, 58, 67, 69, 70, 71, 72, 73, 75, 76, 77, 78, 79, 80, 81, 82, 87, 88, 89, 91, 92, 93, 95], "hbm3": 74, "hbm3e": 23, "he": 51, "head": [1, 6, 8, 12, 16, 21, 27, 28, 29, 54, 59, 64, 73, 82, 83, 93, 97], "head_dim": [97, 98], "head_siz": [5, 82, 84, 87, 93], "header": 2, "headsiz": 82, "headsperlay": 1, "health": [30, 58], "heat": 6, "heavi": 80, "heavier": 76, "height": [39, 83, 87], "hello": [40, 42, 43, 44, 46, 47, 48, 49, 50, 52, 54, 55, 58, 59, 66, 67, 75, 81, 88, 94], "help": [2, 3, 5, 7, 16, 26, 27, 29, 30, 33, 34, 45, 52, 54, 60, 61, 65, 72, 73, 74, 75, 78, 79, 80, 81, 82, 88, 93, 96], "helper": [1, 82], "henc": 95, "here": [2, 3, 7, 10, 13, 14, 15, 16, 17, 19, 20, 22, 23, 27, 28, 30, 32, 36, 40, 45, 65, 72, 75, 76, 77, 79, 80, 82, 87, 88, 89, 90, 92, 94, 97, 98, 99], "heterogen": 2, "heurist": [5, 28, 73, 82, 93], "hf": [6, 10, 13, 17, 29, 30, 46, 47, 48, 49, 50, 54, 55, 56, 57, 59, 73, 74, 75, 87, 91, 92, 94], "hf_config_or_dir": 84, "hf_lora_convert": 10, "hf_model": [73, 84], "hf_model_dir": [13, 14, 15, 19, 84], "hf_model_nam": 73, "hf_model_or_dir": 84, "hf_quant_config": 73, "hf_token": 73, "hfconfigordir": 84, "hgx": 23, "hi": 10, "hidden": [0, 3, 4, 5, 
6, 10, 12, 26, 27, 70, 82, 83, 93], "hidden_act": [15, 83, 84], "hidden_dim": [0, 5, 82], "hidden_dim_per_head": [5, 82], "hidden_dtyp": 83, "hidden_s": [0, 7, 15, 17, 82, 83, 84, 87, 95, 97], "hidden_size_in": 10, "hidden_size_out": 10, "hidden_size_per_head": 82, "hidden_st": [14, 82, 83, 84, 87, 92, 95], "hidden_states_for_emb": 84, "hiddens": [0, 1, 6], "hide": [26, 28], "hierarch": 15, "hierarchi": [19, 64, 82], "high": [3, 12, 14, 16, 19, 21, 25, 26, 27, 28, 69, 73, 81, 82, 89, 93], "higher": [0, 1, 5, 6, 9, 10, 12, 17, 21, 22, 24, 28, 71, 74, 81, 89, 93, 95], "highest": [6, 7, 22, 23], "highli": [12, 16, 28, 72, 77], "highlight": [22, 25, 77, 79], "himself": 51, "hin": 27, "hint": [73, 82], "histori": 28, "hit": [0, 28, 70, 74, 79, 80, 93], "hk": 12, "ho": 10, "hoc": [19, 87], "hold": [0, 1, 3, 4, 7, 8, 9, 10, 12, 70, 76, 83, 89, 96], "home": [20, 59, 73], "homo_head_pattern": 83, "homogen": 2, "hope": 26, "hopper": [5, 9, 20, 21, 22, 25, 27, 28, 29, 64, 65, 71, 77, 91, 93], "horatio": 51, "horizont": [28, 29], "host": [1, 10, 28, 30, 32, 37, 52, 57, 64, 65, 70, 80, 82, 93], "host_cache_s": 70, "host_context_length": [82, 83, 84, 87, 92], "host_context_progress": [82, 83, 92], "host_cross_kv_cache_block_offset": [83, 87], "host_cross_kv_cache_pool_map": 83, "host_cross_kv_cache_pool_point": 83, "host_kv_cache_block_offset": [82, 83, 87, 92], "host_kv_cache_block_point": 92, "host_kv_cache_pool_map": [82, 83, 92], "host_kv_cache_pool_point": [82, 83, 92], "host_max_attention_window_s": [82, 83, 92], "host_past_key_value_length": [82, 83, 92], "host_request_typ": [82, 83, 84, 92], "host_runtime_perf_knob": [82, 83, 92], "host_sink_token_length": [82, 83, 92], "hostcaches": [0, 9], "hostmemori": 1, "hostnam": 30, "hour": 75, "hous": 76, "how": [0, 2, 3, 12, 14, 16, 17, 19, 29, 32, 40, 55, 64, 69, 70, 72, 75, 77, 78, 80, 82, 88, 89, 90, 92, 94, 96, 97], "howev": [2, 3, 5, 12, 19, 20, 21, 26, 27, 28, 30, 73, 76, 77, 79, 80, 81, 89, 93, 95, 96], "hpc": 22, "html": 
[1, 82, 92], "http": [0, 1, 4, 10, 19, 20, 26, 29, 30, 33, 34, 35, 60, 61, 62, 65, 66, 67, 69, 82, 88, 90, 92, 93, 94], "hub": [18, 58, 70, 73, 88, 93, 94], "hug": [3, 10, 13, 18, 19, 36, 70, 73, 84, 88, 93], "huggingfac": [0, 10, 14, 15, 17, 19, 20, 34, 58, 61, 69, 73, 74, 75, 88, 91, 92, 93, 95], "huggingface_exampl": 94, "huggingface_hub": 58, "huggingface_model_card": 94, "human": [26, 73], "hurt": [28, 80], "hw": [26, 28], "hybrid": [4, 93], "hyper": 15, "hypothesi": 12, "i": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 35, 36, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 62, 65, 66, 67, 69, 70, 71, 73, 74, 75, 77, 78, 79, 80, 81, 82, 83, 84, 85, 87, 88, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], "ia3": 5, "iactivationlay": 16, "ibrahimamin1": 93, "ibufferptr": 1, "iconstantlay": 82, "icudaengin": [87, 89], "id": [0, 1, 3, 9, 27, 36, 48, 70, 73, 74, 82, 83, 87, 88, 97, 98], "idea": [10, 28, 80], "ideal": [7, 77, 79, 93], "ident": [3, 9, 28, 29, 82], "identifi": [0, 6, 10, 12, 16, 73, 79, 82], "idl": 0, "idtyp": [0, 3], "idx": 87, "ieee": 90, "ieinsumlay": 82, "ielementwiselay": 82, "iexecutioncontext": [87, 89], "ifb": [12, 93], "ifilllay": 82, "igatherlay": 82, "ignor": [29, 70, 73, 82, 87], "ignore_eo": [70, 93], "igptdecod": 1, "ihostmemori": [1, 16, 87], "ii": [5, 82], "ij": 82, "ijk": 82, "ijl": 82, "ik": 82, "ikl": 82, "ilay": [7, 16], "illustr": [7, 12, 18, 26, 27], "ilogg": 1, "ilooplay": 82, "imag": [30, 34, 39, 55, 56, 57, 61, 64, 66, 67, 73, 83, 87, 93], "image64": 61, "image_grid_thw": 87, "image_patches_indic": 87, "image_path": 87, "image_s": 84, "image_token_index": 87, "image_url": [34, 61], "imatrixmultiplylay": 82, "imbal": 79, "immedi": [5, 12, 71, 75, 92], "immut": 1, "impact": [11, 12, 21, 25, 26, 27, 28, 30, 58, 76, 77, 79, 80, 81], "imped": 25, "impl": [0, 99], "implement": [2, 3, 5, 6, 8, 12, 15, 16, 18, 19, 21, 28, 52, 64, 71, 82, 
83, 88, 90, 91, 92, 93, 95, 96, 98, 99], "implicit": [1, 5, 12, 82], "implicitli": 1, "import": [11, 12, 17, 19, 21, 25, 27, 28, 30, 36, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 58, 59, 60, 61, 62, 64, 66, 67, 75, 77, 79, 80, 81, 88, 91, 93, 94, 95, 96, 98], "impos": 25, "improv": [5, 9, 11, 16, 21, 22, 23, 24, 25, 26, 27, 28, 29, 42, 46, 47, 49, 50, 64, 71, 73, 74, 75, 77, 78, 79, 80, 93, 94, 97], "in_channel": 83, "in_featur": [15, 16, 83], "in_hidden_s": 82, "in_len": 7, "in_point": 82, "in_progress": 87, "includ": [0, 1, 2, 3, 5, 6, 9, 10, 12, 15, 16, 17, 18, 21, 22, 24, 27, 28, 29, 30, 36, 45, 51, 59, 65, 67, 69, 70, 71, 77, 80, 82, 88, 90, 92, 93, 96, 97, 98, 99], "include_stop_str_in_output": 70, "inclus": 82, "incompat": [29, 93, 94], "incorpor": [0, 26, 71, 93], "incorrect": [9, 12, 93], "increas": [0, 5, 9, 12, 16, 20, 22, 23, 26, 27, 28, 29, 72, 73, 75, 77, 80, 81, 82, 93, 99], "incred": 71, "increment": [65, 93], "incur": [16, 26], "inde": 89, "independ": [0, 1, 2, 3, 12, 82], "index": [0, 1, 3, 8, 12, 17, 26, 36, 53, 64, 66, 67, 70, 82, 87, 88, 93, 97], "index_select": 82, "indic": [0, 1, 3, 5, 6, 12, 15, 70, 81, 82, 83, 87, 89, 98], "indim": 1, "indimfirst": 1, "indirect": 1, "individu": [26, 93], "indivis": 93, "inductor": 70, "industri": 73, "ineffici": [5, 26], "inetworkdefinit": [7, 16, 82], "inevit": 16, "inf": 52, "infeas": 3, "infer": [0, 2, 6, 10, 12, 16, 18, 19, 20, 21, 22, 23, 26, 28, 29, 34, 61, 64, 69, 72, 74, 75, 76, 77, 78, 80, 81, 82, 87, 90, 92, 93, 96], "infer_shap": 87, "inferencerequest": 93, "infin": 32, "infinit": [16, 73, 74], "inflat": 26, "inflight": [0, 5, 10, 12, 30, 68, 70, 73, 78, 79, 82, 93, 97, 99], "inflight_request_id": 99, "inflightbatch": 0, "inflightbatchingstat": [0, 30], "influenc": [26, 80], "info": [0, 29, 30, 73, 89, 92], "inform": [0, 1, 2, 3, 5, 6, 8, 12, 15, 16, 21, 24, 26, 30, 64, 71, 73, 75, 91, 92, 93], "infti": 6, "inherit": [17, 19, 82, 95, 96, 98, 99], "init": [1, 20, 28, 65, 93], 
"init_audio_encod": 87, "init_image_encod": 87, "init_llm": 87, "init_processor": 87, "init_token": 87, "initi": [1, 2, 12, 17, 26, 52, 70, 73, 77, 79, 80, 89, 92, 93, 95, 97, 99], "initializer_list": [0, 1], "initmemorypool": 89, "inittozero": 1, "inlin": [0, 1], "inner": 82, "inner_layernorm": [83, 84], "innov": 28, "inp": 82, "inpaint": [34, 61], "inprogress": 1, "input": [0, 1, 3, 6, 7, 9, 10, 11, 12, 16, 17, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 36, 38, 39, 56, 61, 64, 69, 70, 72, 73, 74, 75, 76, 78, 80, 81, 82, 83, 84, 87, 89, 91, 92, 93, 95, 96, 97, 99], "input_1": 82, "input_1_": 82, "input_audio": 87, "input_featur": 84, "input_fil": 93, "input_id": [9, 14, 26, 73, 82, 84, 87, 92, 95], "input_imag": 87, "input_layernorm": [14, 15, 17, 95], "input_length": [82, 83, 84, 87], "input_list": 82, "input_n": 82, "input_n_": 82, "input_text": [14, 16, 87, 88], "input_timing_cach": [29, 70], "input_token_extra_id": 87, "inputbuff": 1, "inputdesc": 16, "inputdtyp": 1, "inputgentokenshost": 1, "inputlen": 1, "inputpack": [1, 6], "inputs_emb": 95, "inputtokenextraid": 0, "inputtokenid": 0, "insert": [7, 16, 73, 82], "insertinputtensor": 1, "insid": [1, 12, 17, 19, 20, 27, 28, 65, 67, 82, 89, 97], "insight": 26, "insiz": 1, "inspect": [29, 72, 89], "inspir": 27, "instabl": 2, "instal": [19, 30, 31, 55, 56, 57, 65, 69, 75, 88, 93, 95], "instanc": [0, 2, 3, 6, 7, 8, 12, 16, 26, 36, 52, 69, 70, 87, 89, 93, 97], "instance_idx": 92, "instanti": [75, 81, 98], "instead": [7, 9, 12, 16, 19, 20, 21, 36, 65, 70, 80, 81, 82, 89, 93], "instruct": [12, 20, 28, 30, 34, 39, 46, 61, 65, 73, 74, 75, 76, 80, 81, 88, 91, 93, 94, 95], "instrument": 28, "int": [0, 1, 6, 14, 15, 16, 19, 48, 52, 70, 79, 82, 83, 84, 87, 95, 97, 98, 99], "int32": [1, 5, 29, 82, 85, 92], "int32_t": [0, 1, 82], "int4": [17, 19, 25, 29, 36, 59, 64, 91, 93], "int4_weight": 90, "int64": [1, 6, 82, 92], "int64_t": [0, 1], "int8": [1, 15, 17, 19, 25, 29, 64, 70, 77, 82, 89, 91, 93], "int8_kv_cach": [5, 90, 93], 
"int8_t": [0, 1], "int8_weight": 90, "int8awq": 77, "int_clip": 82, "integ": [5, 70, 73, 82, 90, 93], "integr": [12, 93, 96, 97, 98, 99], "intellig": 71, "intend": 89, "intens": 28, "intent": 75, "intention": 19, "intenum": 82, "inter": [2, 75, 76, 77, 79, 80, 92, 93], "inter_layernorm": 84, "inter_s": 17, "interact": [3, 12, 71, 88, 92], "interchang": [8, 69], "interconect": 76, "interconnect": [6, 75, 76, 77, 79, 80], "interest": 73, "interfac": [16, 19, 75, 87, 93, 95, 96], "interfer": 92, "interleav": [5, 16, 28], "intermedi": [5, 16, 28, 70, 92], "intermediate_s": [15, 84], "intern": [1, 3, 5, 8, 19, 20, 26, 28, 75, 78, 89, 92, 98], "internal_error": [29, 30], "internlm": [69, 90, 91, 93], "internlm2": [90, 91, 93], "internvl2": 93, "interpol": 82, "interpolation_scal": 83, "interpret": [3, 65, 79], "intersect": 2, "intertwin": 80, "intflag": [84, 86], "intpsplitdim": 1, "intra": 76, "introduc": [19, 22, 26, 27, 32, 90, 93], "introduct": [78, 88, 93], "intuit": [28, 71, 78], "inv": 82, "inv_freq": 82, "invalid": [92, 93], "invalidateremoteag": 0, "inventori": 73, "invers": 5, "invest": 73, "investig": [20, 93], "invit": 59, "invoc": 93, "invok": [0, 3, 7, 69, 92, 99], "invokequant": 16, "involv": [0, 1, 2, 12, 16, 25, 27, 28, 83, 96, 97, 98], "io": [5, 31, 32, 89, 93], "ip": [0, 93], "ipc": 65, "ipc_uc_handl": 1, "ipc_uc_ptr": 1, "ipc_uc_va": 1, "ipcmemori": 1, "ipcnvl": 1, "ipcnvlsalloc": 1, "ipcnvlsfre": 1, "ipcnvlshandl": 1, "ipcnvlssupport": 1, "ipluginv3lay": 82, "ireducelay": 82, "irrespect": [0, 6, 52, 70], "is_alibi": 82, "is_caus": 83, "is_const_v": 1, "is_cuda_graph": 97, "is_def": 82, "is_dora": 10, "is_dynam": 82, "is_enc_dec": 87, "is_expert": 83, "is_gated_activ": 82, "is_gemma_2": 84, "is_gemma_3": 84, "is_keep_al": 70, "is_loc": 83, "is_medusa_mod": 87, "is_mla_en": 82, "is_mla_enabled_flag": 82, "is_module_excluded_from_quant": 70, "is_mrop": 82, "is_network_input": 82, "is_orchestrator_mod": 87, "is_public_pool": 70, "is_qkv": 83, 
"is_redrafter_mod": 87, "is_rop": 82, "is_trt_wrapp": 82, "is_use_oldest": 70, "is_valid": 83, "is_valid_cross_attn": 83, "isagentst": 0, "isauto": 0, "isbeamsearch": 0, "iscomplet": 0, "iscontextparallel": 1, "iscontinuouskvcach": 1, "iscrossattent": 1, "isdon": 1, "isdora": 1, "isdrafttokensextern": 1, "iseagl": [0, 1], "iselectlay": 82, "isexplicitdrafttoken": [0, 1], "isexternaldrafttoken": 0, "isfin": [0, 3], "isfirstcontextparallelrank": 1, "isfirstpipelineparallelrank": 1, "isfirsttensorparallelrank": 1, "isgreedysampl": 0, "ishufflelay": 82, "iskvcacheen": 1, "isl": [0, 21, 22, 23, 24, 26, 27, 28, 73, 74, 80], "islastpipelineparallelrank": 1, "isleg": 0, "islicelay": 82, "isload": 1, "islookahead": 0, "islookaheaddecod": 1, "ismedusa": [0, 1], "ismpist": 0, "ismultimod": 1, "isn": 92, "isnon": 1, "isoftmaxlay": 82, "isorchestr": 0, "ispagedkvcach": 1, "isparticip": [0, 93], "ispipelineparallel": 1, "ispoint": 1, "isrnnbas": 1, "issequencefin": [0, 3], "issocketst": 0, "issu": [5, 16, 19, 58, 64, 65, 67, 69, 73, 74, 75, 82, 92], "istensorparallel": 1, "isthreadsaf": 0, "istopk": 0, "istopkandtopp": 0, "istopkortopp": 0, "istopp": 0, "istransformerbas": 1, "istream": [0, 1], "isunsign": 1, "isusebantoken": 0, "isusebanword": 0, "isuseexpliciteosstop": 0, "isusefrequencypenalti": 0, "isusemaxlengthstop": 0, "isuseminlength": 0, "isuseminp": 0, "isusenorepeatngrams": 0, "isuseoccurrencepenalti": 0, "isusepenalti": 0, "isusepresencepenalti": 0, "isuserepetitionpenalti": 0, "isusestopcriteria": 0, "isusestopword": 0, "isusetemperatur": 0, "isusevariablebeamwidthsearch": 0, "iswhisp": 1, "ite": 87, "item": [0, 3, 28, 87], "itensor": [0, 82], "itensorbind": 1, "itensorptr": 1, "iter": [0, 1, 3, 5, 12, 17, 26, 27, 30, 70, 71, 73, 75, 79, 80, 81, 87, 93], "iter_stats_max_iter": 70, "iterationresult": 70, "iterationstat": 0, "iterationtyp": 0, "iterlatencym": [0, 30], "iterlatencymillisec": 93, "iterstat": 0, "iterstatsmaxiter": 0, "iterstatsvec": 0, "ith": 82, "itl": 
[77, 80, 93], "its": [0, 1, 3, 5, 6, 7, 8, 13, 15, 16, 17, 19, 21, 23, 26, 27, 45, 69, 71, 73, 76, 78, 79, 80, 82, 89, 96, 97, 99], "itself": [3, 28, 87], "itsuji": 73, "iunarylay": 82, "j": [5, 6, 22, 25, 27, 55, 56, 57, 69, 73, 82, 90, 91, 93], "jacobi": 12, "jai": 93, "jamesthez": 93, "jane": 59, "janpetrov": 93, "japanes": [10, 73], "jax": [15, 19], "ji": 82, "jit": [20, 67, 93], "jj": 82, "jk": 82, "jl749": 93, "job": [16, 56, 57], "joint": 28, "joint_attention_kwarg": 84, "joint_attn_forward": 83, "journei": [26, 71], "jpg": 73, "json": [0, 1, 3, 15, 30, 33, 34, 35, 38, 39, 45, 52, 70, 72, 73, 88, 93], "json_object": 70, "jsonconfigstr": 0, "jsonl": 73, "jsonseri": 0, "just": [0, 1, 12, 27, 28, 55, 56, 57, 58, 67, 73, 75, 81, 87, 89], "justic": [42, 46, 47, 49, 50, 58], "k": [1, 5, 6, 10, 12, 18, 26, 27, 28, 70, 82, 90, 92, 93, 95, 97], "k_b_proj_tran": 82, "k_dim": 82, "k_proj": [17, 95], "kattent": 1, "kattn_dens": 1, "kattn_k": 1, "kattn_q": 1, "kattn_qkv": 1, "kattn_v": 1, "kauto": 0, "kbatchedpostprocessornam": [0, 3], "kbeamsearch": 0, "kbf16": 0, "kblk": 0, "kbool": [0, 1], "kbyte_typ": 1, "kc_cache_retention_config": 93, "kcancel": 0, "kchatglm": 1, "kcontext": 1, "kcontext_in_progress": 0, "kcontinu": 1, "kcpu": [0, 1], "kcpu_pin": 0, "kcpu_pinnedpool": 0, "kcross_attn_dens": 1, "kcross_attn_k": 1, "kcross_attn_q": 1, "kcross_attn_qkv": 1, "kcross_attn_v": 1, "kdatatyp": 1, "kdecoder_onli": [0, 13], "kdefault": 0, "kdefault_num_tokens_per_block": 1, "kdefaultbatchsizet": 0, "kdefaultdynamicbatchmovingaveragewindow": 0, "kdefaultgpuspernod": 1, "kdefaultiterstatsmaxiter": 0, "kdefaultlookaheaddecodingngram": 0, "kdefaultlookaheaddecodingverificationset": 0, "kdefaultlookaheaddecodingwindow": 0, "kdefaultmaxadapters": 0, "kdefaultmaxpagesperblockdevic": 0, "kdefaultmaxpagesperblockhost": 0, "kdefaultmaxseqidlemicrosecond": 0, "kdefaultoptimaladapters": 0, "kdefaultprior": 0, "kdefaultrequeststatsmaxiter": 0, "kdefaultretentionprior": 0, "kdisabl": 1, 
"kdrafttokensextern": 1, "kdram": 0, "kdynamicpostprocessornameprefix": 0, "keagl": [0, 1], "kebnf_grammar": [0, 3], "keep": [0, 5, 6, 19, 26, 28, 70, 74, 81, 82, 93], "keepdim": 82, "kei": [0, 2, 3, 9, 16, 21, 25, 27, 28, 64, 73, 74, 79, 84, 87, 92, 96, 97, 98], "kenabl": 1, "kencdec": 1, "kencoder_decod": 0, "kencoder_in_progress": 0, "kencoder_onli": 0, "kend_id": 0, "kept": [5, 19, 70, 82], "kequal_progress": 0, "kera": 19, "kernel": [1, 5, 9, 16, 21, 27, 28, 29, 52, 67, 71, 72, 77, 80, 82, 87, 88, 89, 92, 93], "kernel_s": [82, 83], "kexplicitdrafttoken": [0, 1], "kexternaldrafttoken": 0, "key_length": [82, 83], "keyvaluecacheparam": [83, 84], "keyword": [17, 70, 82, 89], "kfile": 0, "kfirst_come_first_serv": 0, "kfloat": [1, 16], "kfp16": 0, "kfp32": [0, 70], "kfp8": 0, "kgener": 1, "kgeneration_complet": 0, "kgeneration_in_progress": 0, "kglm": 1, "kgpt": 1, "kgpu": [0, 1], "kguaranteed_no_evict": 0, "khalf": 1, "kind": [4, 5, 7, 26, 99], "kinflight": 0, "king": 51, "kint32": [0, 1], "kint64": [0, 1], "kint8": [0, 1], "kinvalid": 1, "kispoint": 1, "kisunsign": 1, "kj": 82, "kjson": [0, 3], "kjson_schema": [0, 3], "kleader": [0, 2], "klength": 0, "klinear": 1, "klookahead": 0, "klookaheaddecod": 1, "kmamba": 1, "kmax_util": 0, "kmaxretentionprior": 0, "kmedusa": [0, 1], "kminretentionprior": 0, "kmla": 0, "kmlp_4h_to_h": 1, "kmlp_gate": 1, "kmlp_gate_up": 1, "kmlp_h_to_4h": 1, "kmlp_router": 1, "kmoe_4h_to_h": 1, "kmoe_gat": 1, "kmoe_h_to_4h": 1, "kmoe_rout": 1, "kmpi": 0, "knegativeinfin": 1, "knob": [0, 70, 81, 82], "knone": 1, "knoop": 1, "knot_finish": 0, "know": [6, 72, 81, 82], "knowledg": 64, "known": [5, 12, 16, 64, 67, 82, 91], "knumflag": 0, "kobj": 0, "kopt_profiles_split_point": 1, "korchestr": [0, 2], "kosmo": [91, 93], "kpage": 1, "kpin": 1, "kpinnedpool": 1, "kqueu": 0, "kread": 0, "krecurr": 1, "krecurrentgemma": 1, "kregex": [0, 3], "kstatic": 0, "kstatic_batch": 0, "kstop_word": 0, "kstructural_tag": 0, "ktimed_out": 0, "ktopk": 0, 
"ktopktopp": 0, "ktopp": 0, "ktrtpointertyp": 1, "kuint8": [0, 1], "kunderlyingtyp": 1, "kunish": 10, "kunknown": 0, "kunsign": 1, "kusebantoken": 0, "kusebanword": 0, "kuseexpliciteosstop": 0, "kusefrequencypenalti": 0, "kusemaxlengthstop": 0, "kuseminlength": 0, "kuseminp": 0, "kusenorepeatngrams": 0, "kuseoccurrencepenalti": 0, "kusepenalti": 0, "kusepresencepenalti": 0, "kuserepetitionpenalti": 0, "kusestandardstopcriteria": 0, "kusestopword": 0, "kusetemperatur": 0, "kusevariablebeamwidthsearch": 0, "kuvm": [0, 1], "kv": [0, 1, 2, 3, 10, 16, 19, 21, 25, 27, 28, 29, 30, 36, 40, 41, 43, 44, 54, 64, 68, 70, 71, 73, 74, 75, 79, 82, 87, 88, 93, 94, 95, 96, 97, 99], "kv_b_proj": 82, "kv_cach": 0, "kv_cache_block_offset": [82, 83, 87, 92], "kv_cache_block_point": 92, "kv_cache_config": [30, 36, 43, 44, 49, 51, 53, 54, 70, 81, 98], "kv_cache_dtyp": [51, 70, 73, 77, 86, 98], "kv_cache_enable_block_reus": [87, 93], "kv_cache_free_gpu_mem_fract": [20, 74, 81], "kv_cache_free_gpu_memory_fract": [30, 37, 87, 93], "kv_cache_host_memory_byt": 9, "kv_cache_manag": [0, 93, 96, 97, 98, 99], "kv_cache_param": [83, 84, 97], "kv_cache_quant_algo": [15, 59, 70, 73, 77], "kv_cache_quant_mod": [5, 82], "kv_cache_retention_config": 70, "kv_cache_scaling_factor": [5, 15], "kv_cache_typ": [16, 29, 70, 87, 93], "kv_dtype": 84, "kv_event": 51, "kv_head": 83, "kv_host_cache_byt": 9, "kv_lora_rank": [82, 83], "kv_orig_quant_scal": 82, "kv_quant_orig_scal": 82, "kvalue_status_load": 1, "kvalue_status_miss": 1, "kvalue_status_process": 1, "kvcach": [0, 26, 43, 44, 54, 93], "kvcacheblock": 8, "kvcacheblockpool": 8, "kvcacheconfig": [0, 5, 9, 36, 43, 44, 49, 51, 53, 54, 70, 81, 89], "kvcachecreateddata": [0, 70], "kvcacheev": 0, "kvcacheeventdata": 0, "kvcacheeventdiff": 0, "kvcacheeventmanag": [0, 64], "kvcachehitr": 0, "kvcachehitrateperrequest": 0, "kvcacheindex": 1, "kvcachemanag": [0, 5, 9, 87, 97, 98], "kvcachemetr": 0, "kvcacheparam": 97, "kvcacheremoveddata": [0, 70], 
"kvcacheretentionconfig": [0, 70], "kvcaches": 0, "kvcachestat": [0, 30], "kvcachestoredblockdata": 0, "kvcachestoreddata": [0, 70], "kvcachetransferend": 0, "kvcachetransferm": 0, "kvcachetransfermod": [0, 70], "kvcachetransferstart": 0, "kvcachetyp": [1, 70, 87], "kvcachetypefromstr": 1, "kvcacheupdateddata": [0, 70], "kvfactor": 0, "kvheadnum": 82, "kvram": 0, "kwarg": [17, 19, 70, 82, 83, 84, 87, 93, 95], "kwrite": 0, "kxgrammar": 0, "l": [12, 55, 56, 57, 73, 91], "l2": 29, "l20": 29, "l304": 26, "l345": 26, "l4": 29, "l40": 29, "l440": 26, "l506": 26, "l546": 26, "l823": 26, "lab": 73, "label": [7, 82, 83, 84], "labelembed": 83, "lack": 0, "lai": 27, "lambda": [0, 3], "lamportinitializeal": 1, "languag": [0, 6, 12, 16, 18, 21, 26, 71, 72, 82, 90, 91, 93, 96], "language_adapt": [87, 93], "language_adapter_config": 87, "language_adapter_rout": [84, 87], "language_adapter_uid": 87, "language_model": 17, "languageadapterconfig": 87, "languageadapteruid": 0, "larg": [5, 9, 11, 12, 16, 18, 19, 20, 21, 25, 26, 28, 29, 34, 52, 61, 71, 72, 73, 76, 77, 79, 80, 82, 89, 91, 92, 93, 96], "larger": [0, 2, 5, 6, 9, 12, 13, 20, 22, 23, 25, 28, 54, 70, 73, 74, 82, 87, 89, 93], "largest": [6, 21, 22, 23, 82], "last": [0, 1, 3, 5, 10, 11, 12, 14, 26, 27, 70, 79, 81, 82, 84], "last_lay": 87, "last_process_for_ub": 82, "last_token_id": [82, 84, 92], "last_token_ids_for_logit": 84, "last_tokens_id": 82, "lastdraftindic": 1, "lastdraftlen": 1, "lastdraftpath": 1, "lastdrafttoken": 1, "lastgenerationlength": 1, "lastit": 0, "lastpositionidsbas": 1, "lasttokentim": 0, "late": 58, "latenc": [0, 5, 9, 12, 22, 23, 25, 27, 28, 29, 64, 70, 74, 79, 80, 81, 82, 93], "latent": [28, 83, 84], "later": [0, 1, 6, 10, 12, 16, 19, 23, 46, 49, 69, 77, 80, 87, 89, 92, 94], "latest": [0, 28, 31, 65, 88, 93], "latter": [3, 25, 93], "launch": [2, 9, 16, 28, 30, 52, 55, 56, 57, 64, 67, 69, 75, 92, 93, 94], "launch_llama_3": 16, "layer": [0, 1, 2, 4, 5, 6, 7, 8, 10, 12, 14, 15, 16, 17, 27, 29, 70, 76, 82, 
87, 88, 89, 90, 92, 93, 95, 97, 98], "layer1": 10, "layer_idx": [10, 14, 82, 87, 95, 97], "layer_names_onli": [29, 70], "layer_norm": [82, 83], "layer_quant_mod": 70, "layer_typ": 87, "layerid": [1, 10], "layeridx": 1, "layernorm": [14, 29, 80, 82, 83, 93], "layernorm_shar": 83, "layernorm_typ": 83, "layernormpositiontyp": 82, "layernormtyp": [82, 83], "layertyp": [1, 7], "layerwis": 70, "layout": [79, 93], "lead": [7, 9, 12, 16, 29, 58, 65, 73, 74, 75, 77, 79, 80], "leader": [0, 87], "learn": [22, 23, 25, 42, 46, 47, 49, 50, 52, 77, 82, 88], "learned_absolut": [15, 82, 83, 84], "least": [0, 3, 5, 19, 20, 30, 58, 79, 87], "leav": [59, 79, 80, 81], "left": [70, 74, 79, 81, 82], "legaci": [17, 81, 85, 93], "len": [0, 1, 73, 82, 87, 99], "length": [0, 1, 5, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 53, 70, 73, 74, 75, 78, 80, 81, 82, 87, 89, 92, 93, 97, 98], "length_penalti": [6, 70, 87], "lengthlengthpenalti": 6, "lengthpenalti": [0, 1, 6], "less": [0, 3, 5, 6, 16, 22, 27, 70, 74, 82], "let": [7, 14, 15, 17, 26, 31, 36, 71, 73, 79, 82], "letter": 82, "level": [0, 1, 3, 5, 8, 10, 14, 15, 17, 19, 27, 28, 29, 30, 49, 69, 72, 73, 89, 93, 95], "leverag": [12, 21, 26, 27, 77, 88], "lf": [10, 20, 65, 69], "lfz941": 93, "lh": 1, "lib": [19, 67, 73], "libnam": 0, "libnvinfer_plugin_tensorrt_llm": 65, "libopenmpi": [66, 67], "librari": [16, 18, 65, 69, 71, 92, 93, 97], "libtensorrt_llm": 65, "licens": [69, 88], "life": 58, "lifecycl": 8, "lightweight": 5, "like": [0, 3, 5, 6, 7, 9, 12, 15, 16, 18, 19, 25, 26, 27, 28, 29, 36, 42, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59, 70, 71, 73, 75, 76, 77, 79, 80, 81, 82, 88, 89, 90, 92, 93, 94, 95, 96, 98], "likelihood": [4, 9, 12], "limit": [0, 2, 3, 5, 6, 7, 16, 19, 20, 25, 26, 27, 28, 36, 67, 69, 70, 71, 75, 79, 81, 82, 85, 87, 89, 91, 97], "lin": 21, "line": [9, 20, 25, 73, 75, 77, 80, 89, 93, 98, 99], "linear": [1, 10, 12, 14, 15, 16, 28, 82, 89, 90, 93, 95, 97], "linearactiv": 83, "linearapproximategelu": 83, 
"linearbas": 83, "lineargeglu": 83, "lineargelu": 83, "linearli": 89, "linearswiglu": 83, "link": [9, 20, 26, 31, 32, 93], "linspac": 82, "linux": [64, 91, 93], "linux_x86_64": 65, "list": [0, 1, 3, 5, 6, 7, 15, 16, 17, 18, 27, 36, 52, 65, 68, 70, 71, 73, 74, 75, 82, 83, 84, 87, 91, 92, 93, 97, 98, 99], "list_siz": 83, "liter": 70, "littl": [27, 80], "live": 89, "livecodebench": 26, "lkm2835": 93, "ll": [25, 30], "llama": [6, 10, 12, 13, 17, 19, 22, 23, 25, 29, 46, 54, 69, 75, 76, 78, 79, 81, 88, 90, 91, 93, 94, 95], "llama2": [5, 10, 21, 22, 93], "llama3": 82, "llama4": 70, "llama4forconditionalgener": 91, "llama_13b": 23, "llama_70b": 23, "llama_7b": [10, 13], "llama_7b_with_lora_qkv": 10, "llama_model_path": 36, "llamaconfig": [84, 95], "llamaforcausallm": [17, 19, 84, 91], "llamamodel": 84, "llava": [17, 90, 91, 93], "llava_dict": 17, "llavallamamodel": 91, "llavanextforconditionalgener": 91, "llavanextvisionconfig": 84, "llavanextvisionwrapp": 84, "llm": [0, 2, 3, 5, 6, 7, 8, 9, 10, 11, 14, 16, 21, 24, 26, 28, 29, 30, 33, 34, 35, 37, 38, 39, 43, 44, 45, 46, 47, 48, 49, 51, 52, 53, 54, 58, 59, 60, 61, 62, 66, 67, 68, 70, 72, 74, 76, 77, 78, 80, 81, 82, 84, 86, 87, 90, 92, 94, 95, 96, 97, 98, 99], "llm_arg": [70, 74], "llm_engine_dir": 87, "llm_inference_distribut": 69, "llm_kwarg": [43, 44, 54], "llm_mgmn_": 93, "llm_option": 74, "llm_ptq": 94, "llmapi": [3, 30, 36, 43, 44, 45, 49, 51, 53, 54, 55, 56, 57, 59, 70, 74, 77, 93], "llmarg": [70, 74, 93], "llmrequest": [1, 98, 99], "llmrequestptr": 1, "llmrequestst": 99, "lm": 12, "lm_head": [14, 17, 54, 73, 93], "lmm": [6, 73], "lmsy": [43, 44, 54], "ln_emb": 17, "ln_f": [14, 17], "load": [0, 1, 10, 14, 15, 16, 19, 24, 26, 28, 29, 46, 49, 54, 67, 69, 70, 73, 74, 75, 80, 81, 84, 86, 87, 88, 89, 93], "load_format": 70, "load_model_on_cpu": 84, "load_tensor": 17, "load_test_audio": 87, "load_test_data": 87, "load_weight": 95, "loaded_weight": 83, "loader": 93, "loadformat": 70, "loadinprogress": 1, "loadremoteag": 0, 
"loadweight": 1, "local": [15, 16, 20, 26, 29, 46, 47, 48, 49, 50, 55, 56, 57, 59, 65, 67, 70, 73, 74, 77, 93, 98], "local_in_featur": 83, "local_layer_idx": 83, "local_model": [55, 56, 57], "local_out_featur": 83, "local_us": [20, 65, 88], "localhost": [30, 33, 34, 35, 37, 38, 39, 60, 61, 62, 88], "localinadapters": 1, "localindim": 1, "localinouts": 1, "localins": 1, "localoutadapters": 1, "localoutdim": 1, "localouts": 1, "localreduct": 26, "localscaless": 1, "localtotals": 1, "locat": [6, 7, 16, 28, 59, 65, 73, 74, 82, 88, 92, 97], "locate_accepted_draft_token": 87, "lock": [67, 73], "lockstep": 0, "log": [0, 1, 5, 8, 29, 30, 31, 55, 56, 57, 59, 70, 73, 82, 88, 89, 93], "log_level": [29, 30], "log_softmax": 82, "logic": [3, 8, 17, 19, 52, 83, 93, 95, 96, 99], "login": [31, 88], "logit": [0, 1, 6, 12, 26, 27, 40, 41, 70, 73, 82, 84, 87, 92, 93], "logits_dtyp": [15, 29, 84], "logits_processor": [52, 70, 87], "logits_processor_map": 87, "logits_processor_nam": 87, "logitspostprocessor": 0, "logitspostprocessorbatch": [0, 3], "logitspostprocessorconfig": [0, 3, 93], "logitspostprocessormap": 0, "logitspostprocessornam": 0, "logitsprocessor": [52, 70, 87, 93], "logitsprocessorlist": 87, "logitsvec": 1, "logn": [82, 93], "logn_scal": 82, "logprob": [0, 1, 36, 53, 70, 88], "logprobs_diff": 70, "logprobscba": 1, "logprobstil": 1, "london": 92, "long": [5, 25, 29, 72, 73, 75, 76, 77, 79, 80, 89, 93], "long_mscal": [82, 83], "long_rop": 82, "long_rope_embed_posit": 83, "long_rope_embed_positions_for_gpt_attent": 83, "long_rope_rotary_cos_sin": 82, "long_rope_rotary_inv_freq": [82, 83], "longer": [0, 6, 9, 26, 28, 70, 74, 79, 82, 99], "longest": [2, 27, 79, 82], "longrop": 82, "longtensor": [87, 95], "look": [0, 3, 19, 24, 65, 71, 73, 93], "lookahead": [0, 1, 40, 41, 64, 70, 93], "lookahead_config": [53, 70, 87], "lookahead_decod": [29, 84], "lookaheadconfig": 0, "lookaheaddecod": 1, "lookaheaddecodingbuff": 1, "lookaheaddecodingconfig": [0, 1, 53, 70], "lookaheadinput": 
1, "lookaheadoutput": 1, "lookaheadruntimebuff": 1, "lookaheadruntimeconfig": 1, "lookup": [64, 82, 83, 93], "lookup_plugin": 82, "loop": [0, 3, 6, 16, 17, 70, 81], "lopuhin": 93, "lora": [0, 1, 3, 40, 41, 64, 68, 70, 82, 83, 84, 87, 93], "lora_ckpt_sourc": [29, 87], "lora_config": [58, 70, 84], "lora_dir": [10, 29, 58, 87], "lora_dir1": 58, "lora_dir2": 58, "lora_dir3": 58, "lora_hidden_st": 83, "lora_layer_param": 83, "lora_manag": [58, 70, 87, 93], "lora_param": 84, "lora_plugin": [10, 29, 82, 87], "lora_rank": [10, 82], "lora_request": [58, 70], "lora_runtime_param": 83, "lora_target_modul": [10, 29, 84, 87], "lora_task_uid": 87, "lora_uid": 87, "lora_weights_point": 82, "loracachefullexcept": 1, "loracachepagemanag": 1, "loraconfig": [0, 10, 58, 70, 84, 93], "loraexpectedexcept": 1, "loraid": 0, "loramanag": 87, "loramodulenam": 1, "loraparam": 84, "loraprefetchdir": 0, "lorarequest": [58, 70], "loraruntimeparam": 83, "lorataskidtyp": [0, 1], "loraweight": 10, "loss": [25, 77], "lot": [5, 9, 16, 18, 27], "loudspeak": 23, "lovelac": [71, 91, 93], "low": [5, 14, 19, 20, 25, 26, 27, 28, 29, 64, 82, 93], "low_latency_gemm": 82, "low_latency_gemm_plugin": [29, 73, 77, 83], "low_latency_gemm_swiglu": 82, "low_latency_gemm_swiglu_plugin": [29, 77, 85], "low_rank": 82, "lower": [0, 1, 2, 6, 7, 9, 10, 24, 25, 28, 49, 70, 74, 77, 82, 89], "lowprecis": [11, 82], "lru": [1, 9, 82], "lt": 82, "luotuo": 10, "m": [0, 20, 22, 26, 30, 38, 39, 45, 58, 73, 74, 75, 77, 79, 80, 82, 89, 90], "macceptancethreshold": 0, "machin": [9, 20, 25, 52, 93], "madditionalmodeloutput": 0, "maddr": 0, "made": [52, 71, 93, 99], "magentnam": 0, "mahmoudashraf97": 93, "mai": [0, 1, 2, 3, 5, 9, 10, 11, 12, 15, 16, 17, 19, 20, 26, 27, 29, 31, 55, 56, 57, 65, 67, 69, 72, 73, 74, 75, 80, 81, 82, 83, 85, 89, 92, 93, 95, 96, 97, 98], "main": [3, 6, 8, 21, 24, 26, 27, 34, 36, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 58, 59, 61, 66, 67, 69, 70, 72, 75, 77, 80, 81, 82, 88, 89, 92, 94, 95], 
"mainli": 27, "maintain": [2, 10, 21, 22, 25, 73, 77, 90], "major": [19, 26, 59, 71, 74, 89], "make": [1, 2, 5, 7, 10, 16, 19, 20, 25, 26, 27, 31, 32, 53, 58, 64, 65, 71, 73, 75, 81, 82, 88, 92, 93], "make_causal_mask": 83, "makeshap": 1, "maketransferag": 0, "mallotedtim": 0, "mallreducecommptr": 1, "mamba": [29, 69, 82, 90, 91, 93], "mamba1": 82, "mamba2": [82, 93], "mamba_conv1d": 82, "mamba_conv1d_plugin": [29, 87], "mamba_vers": 82, "mambaconfig": 84, "mambaforcausallm": 84, "manag": [0, 1, 2, 5, 12, 16, 28, 29, 36, 64, 67, 69, 75, 81, 85, 87, 88, 89, 93, 94, 96, 97], "managedweight": 0, "managedweightsmap": 1, "manageweightstyp": 1, "manageweighttyp": 1, "mandatori": [1, 3, 15], "mani": [0, 5, 8, 9, 12, 16, 19, 27, 28, 29, 32, 59, 70, 74, 77, 79, 81, 82, 91, 92], "manipul": 7, "manner": 7, "mantissa": 22, "manual": [28, 36, 70, 87, 92], "manufactur": 73, "map": [0, 1, 2, 3, 5, 7, 11, 14, 15, 16, 17, 19, 26, 74, 82, 83, 84, 87, 88, 98], "marcellu": 51, "mard1no": 93, "margin": [73, 79], "mark": [1, 7, 79, 82, 92], "mark_as_remov": 7, "mark_output": [3, 82], "markalldon": 1, "markdon": 1, "marker": 70, "marks101": 93, "marktaskdon": 1, "mask": [0, 1, 5, 12, 26, 27, 52, 82, 83, 84, 87, 97], "mask_typ": 82, "masked_scatt": 82, "masked_scatter_": 82, "masked_select": [82, 93], "massiv": 20, "master": [76, 77, 78], "mat2": 82, "match": [0, 4, 7, 12, 27, 64, 70, 73, 82, 83, 87, 88, 92, 93], "match_and_rewrit": 7, "materi": 3, "math": [26, 28, 91], "matichon": 93, "matmul": [5, 16, 29, 77, 82, 90], "matric": 4, "matrix": [5, 16, 24, 28, 64, 71, 73, 76, 82, 88, 97], "mattentionconfig": 0, "mattentiontyp": 0, "matter": 9, "matur": 30, "max": [0, 1, 10, 21, 22, 23, 28, 64, 70, 75, 77, 78, 80, 82, 87, 89, 92, 97], "max_all_reduce_block": 1, "max_attention_window": [70, 81, 93], "max_attention_window_s": [5, 81, 82, 87], "max_attn_valu": 83, "max_batch_s": [5, 10, 13, 15, 16, 19, 20, 27, 29, 30, 36, 37, 43, 44, 49, 53, 54, 70, 73, 77, 79, 80, 82, 84, 87, 89, 92, 93, 98], 
"max_beam_width": [3, 5, 29, 30, 36, 49, 70, 82, 84, 87, 89], "max_block": [82, 99], "max_blocks_per_seq": 87, "max_blocks_per_sequ": 82, "max_boost_slid": 73, "max_cache_storage_gb": 70, "max_context_length": [82, 83, 87, 89], "max_cpu_lora": 70, "max_decoder_input_len": 84, "max_decoder_seq_len": 29, "max_dist": [5, 82, 83], "max_draft_len": [29, 43, 44, 54, 70, 84, 86], "max_draft_token": [84, 87], "max_encoder_input_len": [29, 70, 84], "max_gen_token": 84, "max_input_len": [10, 13, 15, 16, 29, 70, 73, 84, 87, 89], "max_input_length": [82, 83, 84, 87], "max_kv_seqlen": 82, "max_lora": 70, "max_lora_rank": [10, 29, 58, 70], "max_low_rank": 82, "max_matching_ngram_s": 70, "max_medusa_token": 87, "max_multimodal_len": 29, "max_new_token": [87, 89], "max_ngram_s": [53, 70], "max_non_leaves_per_lay": [43, 44, 70], "max_num_request": [97, 98, 99], "max_num_token": [20, 29, 30, 36, 37, 49, 70, 73, 77, 79, 80, 84, 89, 93, 97], "max_output_len": [16, 87, 88, 92, 93], "max_period": 83, "max_position_embed": [15, 82, 83, 84], "max_position_embedding_len": 82, "max_power_limit": 73, "max_prompt_adapter_token": 70, "max_prompt_embedding_table_s": [29, 70, 87, 93], "max_record": 70, "max_seq_len": [10, 13, 15, 16, 29, 30, 43, 44, 54, 70, 73, 81, 82, 83, 84, 87, 89, 93, 98], "max_seqlen": [5, 82], "max_seqlen_for_logn_sc": 83, "max_sequence_length": [5, 87], "max_token": [30, 33, 34, 35, 45, 51, 60, 61, 62, 70, 81, 88, 94], "max_tokens_in_paged_kv_cach": [81, 87, 93], "max_util": [0, 70, 81], "max_verification_set_s": [53, 70], "max_window_s": [53, 70], "maxaccepteddrafttokensperstep": 1, "maxacceptedtoken": 1, "maxadapters": 0, "maxattentionwindow": 1, "maxattentionwindowvec": [0, 1], "maxbadwordslen": 1, "maxbatchs": [0, 1, 6], "maxbatchsizeruntim": 0, "maxbatchsizeruntimeupperbound": 0, "maxbatchsizestat": 0, "maxbatchsizetunerrecommend": 0, "maxbeamwidth": [0, 1, 3, 93], "maxdecoderstep": 1, "maxdecodingdrafttoken": 1, "maxdecodingtoken": [0, 1], "maxdraftpathlen": [0, 1], 
"maxdrafttoken": [0, 1], "maxencoderlen": 1, "maxgenerationlength": 1, "maxgenlengthdevic": 1, "maxgenlengthhost": 1, "maxgentoken": 1, "maxim": [0, 21, 23, 26, 28, 73, 81], "maximum": [0, 1, 2, 3, 5, 6, 20, 23, 29, 30, 70, 73, 74, 77, 82, 83, 87, 89, 92, 93, 98], "maxinputlen": [1, 6], "maxinputlength": 1, "maxlength": 1, "maxlengthstop": 0, "maxlorarank": 1, "maxmedusahead": 1, "maxnewtoken": [1, 93], "maxnonleafnodesperlay": 1, "maxnumactiverequest": 0, "maxnumblock": 0, "maxnumpath": 1, "maxnumsequ": [1, 93], "maxnumtoken": [0, 1], "maxnumtokensruntim": 0, "maxnumtokensstat": 0, "maxnumtokenstunerrecommend": 0, "maxoutputlength": 3, "maxpagesperblock": 1, "maxpagesperblockdevic": 0, "maxpagesperblockhost": 0, "maxpathdraftlen": 1, "maxpathlen": [0, 1], "maxpositionembed": [0, 1], "maxpromptembeddingtables": 1, "maxqueues": 0, "maxseqidlemicrosecond": 0, "maxseqlen": 1, "maxsequencelen": [1, 6], "maxsequencelength": 1, "maxstopwordslen": 1, "maxtoken": [0, 89, 93], "maxtokensperenginestep": 1, "maxtokensperstep": 1, "mb": [70, 89], "mbackend": 0, "mbackendagentdesc": 0, "mbart": [91, 93], "mbatchingtyp": 0, "mbatchsizet": 0, "mbeamsearchbuff": 1, "mbeamsearchdiversityr": 0, "mbeamwidth": 0, "mbeamwidtharrai": 0, "mbp": 45, "mbuffer": 1, "mbuffermanag": 1, "mc_handl": 1, "mc_ptr": 1, "mc_va": 1, "mcachemap": 1, "mcachemutex": 1, "mcachepagemanag": 1, "mcachest": 0, "mcachetransceiverconfig": 0, "mcapacityschedulerpolici": 0, "mcommmod": 0, "mcommptr": 1, "mcommstat": 0, "mcommtyp": 0, "mcomputecontextlogit": 1, "mcomputegenerationlogit": 1, "mconfig": [0, 1], "mconnectioninfo": 0, "mcontextchunkingpolici": 0, "mcontextfmha": 1, "mcontextparallel": 1, "mcopyonpartialreus": 0, "mcpu": 1, "mcpudiff": 1, "mcrosskvcachefract": 0, "mcudagraphcaches": 0, "mcudagraphmod": 0, "mcumlogprobstmp": 1, "md": [2, 12, 14, 26, 82, 93, 96], "mdatatyp": [0, 1], "mdebugconfig": 0, "mdebuginputtensor": 0, "mdebugoutputtensor": 0, "mdebugtensornam": 0, "mdebugtensorsmaxiter": 0, 
"mdecod": 1, "mdecodedurationm": 0, "mdecoderetentionprior": 0, "mdecoderst": 1, "mdecoderstream": 1, "mdecodingconfig": 0, "mdecodinglayerworkspac": 1, "mdecodingmod": [0, 1], "mdefaulteaglechoic": 1, "mdefaultmedusachoic": 1, "mdefaultposteriorthreshold": 1, "mdesc": 0, "mdevic": 1, "mdevicebuffermanag": 1, "mdevicecacheperc": 0, "mdeviceid": [0, 1], "mdirectori": 0, "mdllmutex": 0, "mdogreedysampl": 1, "mdonetask": 1, "mdprank": 0, "mdpsize": 0, "mdrafttoken": 0, "mdstdesc": 0, "mdynamicbatchconfig": 0, "mdynamicbatchmovingaveragewindow": 0, "mdynamicdecodelay": 1, "mdynamictreemaxtopk": 0, "me": [34, 58, 59, 61, 88], "meaglechoic": 0, "meagleconfig": 0, "mean": [1, 4, 5, 6, 9, 12, 15, 17, 19, 20, 22, 23, 27, 28, 30, 38, 39, 56, 58, 70, 72, 73, 74, 75, 76, 81, 82, 85, 87, 89], "meaning": [1, 28, 77, 80], "meant": 78, "mearlystop": 0, "measur": [0, 21, 23, 24, 25, 27, 28, 64, 73, 75, 93], "mechan": [3, 16, 98, 99], "media": [73, 93], "media_path": 73, "medium": [25, 92, 93], "medusa": [0, 1, 29, 40, 41, 64, 70, 82, 84, 87, 93], "medusa_choic": [12, 54, 70, 73, 87], "medusa_decode_and_verifi": 87, "medusa_hidden_act": 86, "medusa_logit": 87, "medusa_model_dir": 86, "medusa_output_token": 87, "medusa_path": 87, "medusa_position_offset": 87, "medusa_temperatur": [12, 87], "medusa_topk": 87, "medusa_tree_id": 87, "medusachoic": [0, 1], "medusaconfig": 84, "medusacurtokensperstep": 1, "medusadecodingconfig": [54, 70], "medusaforcausallm": 84, "medusainput": 1, "medusalogit": 1, "medusapath": 1, "medusatargettokensperstep": 1, "medusatreeid": 1, "meet": [25, 82], "membeddingt": 0, "member": [0, 1, 6, 7, 13, 16, 59, 82], "memlock": [65, 92], "memori": [0, 1, 2, 4, 5, 6, 8, 10, 16, 17, 19, 21, 22, 24, 25, 26, 27, 28, 29, 30, 36, 52, 64, 70, 73, 74, 75, 79, 80, 82, 87, 92, 93, 97, 98], "memorydesc": 0, "memorypoolfre": [1, 89], "memorypoolreserv": [1, 89], "memorypooltrimto": 1, "memorypoolus": 1, "memorytyp": [0, 1], "memorytypestr": 1, "memtyp": 1, "memusagechang": 89, 
"menableattentiondp": [0, 1], "menablebatchsizetun": 0, "menableblockreus": 0, "menablechunkedcontext": 0, "menablecontextfmhafp32acc": 0, "menablemaxnumtokenstun": 0, "menablepartialreus": 0, "menabletrtoverlap": 0, "mencodedvocab": 0, "mencoderhiddens": 1, "mengineaddr": 1, "menginebuff": 1, "menginepath": 1, "mengines": 1, "mental": 58, "mention": [6, 19, 20, 36, 77], "menu": [31, 32], "merg": [26, 82], "meshgrid": 82, "meshgrid2d": 82, "messag": [11, 26, 30, 33, 34, 60, 61, 67, 70, 74, 82, 88, 89, 93], "met": [0, 1, 3, 12], "meta": [19, 69, 70, 73, 74, 75, 81, 88, 91], "meta_ckpt_dir": 84, "metadata": [8, 95, 97], "metal": [93, 94], "meth": 69, "method": [0, 1, 3, 5, 6, 12, 13, 15, 16, 19, 21, 27, 28, 36, 52, 67, 70, 73, 87, 90, 92, 93, 95, 96, 98, 99], "metric": [0, 28, 70, 72, 73, 74, 75, 77, 79, 80, 93], "mevent": 1, "meventbuffermaxs": 0, "mexecutionconfig": 1, "mextendedruntimeperfknobconfig": 0, "mfastlogit": 0, "mfinishedstep": 1, "mfirstgentoken": 0, "mflagptr": 1, "mfreegpumemoryfract": 0, "mfreepageid": 1, "mfrequencypenalti": 0, "mfuntowicz": 93, "mgathergenerationlogit": 0, "mgemmallreducedtyp": 1, "mgmn": [40, 41], "mgpu": 1, "mgpudiff": 1, "mgpuspernod": 1, "mgpuweightsperc": 0, "mgreedysampl": 0, "mguid": 0, "mguideddecodingconfig": 0, "mguidetyp": 0, "mh": 12, "mh1": 12, "mha": [5, 8, 21, 28, 29, 82, 87, 97], "mhandler": 0, "mhiddens": 1, "mhostcaches": 0, "mi": 90, "mib": 89, "micro": [0, 89], "microbatchid": 0, "microbatchschedul": [96, 99], "microsecond": 0, "microsoft": 15, "middl": 72, "might": [0, 3, 16, 19, 20, 25, 29, 65, 69, 71, 73, 75, 76, 80, 87, 89, 92, 93, 98], "migrat": [19, 85, 93], "million": [59, 73], "millisecond": 0, "millisecondstyp": 0, "mimpl": 0, "min": [0, 1, 6, 22, 26, 27, 28, 70, 73, 75, 80, 82, 92], "min_lat": 82, "min_length": [6, 87], "min_p": [0, 6, 70, 87], "min_token": 70, "mind": [25, 81], "mindim": 1, "mindimfirst": 1, "mini": 93, "minim": [26, 79, 88], "minimum": [0, 5, 6, 70, 73, 74, 77, 82, 87, 89], 
"minitron": [91, 93], "minittozero": 1, "minlength": [1, 6, 93], "minnormedscorescba": 1, "minor": [59, 93], "minp": [0, 1, 6], "minprogresstask": 1, "minputpack": 1, "minputtokenextraid": 0, "mintoken": [0, 93], "mintpsplitdim": 1, "minut": [0, 25, 75], "mip": 0, "mipcmemoryhandl": 1, "mirco": 0, "mish": 83, "mismatch": [19, 67, 92], "misorchestr": 0, "mispagefre": 1, "miss": [0, 7, 20, 73, 93], "missedblock": 0, "missedblocksperrequest": 0, "mission": 26, "mistral": [4, 69, 73, 77, 80, 90, 91, 93], "mistralai": [73, 91], "mistralforcausallm": 91, "misus": 93, "miterstatsmaxiter": 0, "mitig": [19, 26], "mix": [2, 28, 76, 80, 93], "mixed_precis": 70, "mixed_sampl": 70, "mixer": 93, "mixtral": [4, 10, 69, 73, 77, 80, 90, 91, 93], "mixtralforcausallm": 91, "mixtur": [28, 64, 80, 93], "mjointdecodinginput": 1, "mjointdecodingoutput": 1, "mkdir": 31, "mkdtemp": [46, 49], "mkvcacheconfig": 0, "mkvcachetyp": 1, "mkvfactor": 0, "ml": [82, 93], "mla": [26, 27, 82, 93], "mlayertyp": 1, "mlen": 0, "mlengthpenalti": 0, "mllama": [91, 93], "mllamaconfig": 84, "mllamaforcausallm": 84, "mllamaforconditionalgener": 91, "mlogit": 0, "mlogitsdtyp": 1, "mlogitspostprocessorconfig": 0, "mlookaheaddecodingconfig": 0, "mlookaheaddecodingmaxnumrequest": 0, "mloramodul": 1, "mloraprefetchdir": 0, "mlp": [10, 14, 16, 17, 29, 82, 92, 93, 95], "mlp_4h_to_h": [10, 29], "mlp_bia": 84, "mlp_gate": [10, 29], "mlp_gate_up": [10, 29], "mlp_h_to_4h": [10, 29], "mlp_output": 92, "mlp_router": [10, 29], "mlphiddens": 1, "mlptype": 82, "mm": 93, "mm_data": 73, "mm_embedding_offload": 87, "mma": [28, 82], "mmanag": 1, "mmanagedweightsmap": 1, "mmanageweightstyp": 1, "mmaxadapters": 0, "mmaxattentionwindow": 0, "mmaxattentionwindowvec": 0, "mmaxbatchs": [0, 1], "mmaxbeamwidth": [0, 1], "mmaxdecodingdecodertoken": 1, "mmaxdecodingdrafttoken": 1, "mmaxdecodingenginetoken": 1, "mmaxdraftpathlen": 1, "mmaxencoderlen": 1, "mmaxinputlen": 1, "mmaxlorarank": 1, "mmaxnonleafnodesperlay": 1, 
"mmaxnumpackedmask": 1, "mmaxnumpath": 1, "mmaxnumtoken": [0, 1], "mmaxpagesperblock": 1, "mmaxpagesperblockdevic": 0, "mmaxpagesperblockhost": 0, "mmaxpositionembed": 1, "mmaxpromptembeddingtables": 1, "mmaxqueues": 0, "mmaxseqidlemicrosecond": 0, "mmaxsequencelen": 1, "mmaxsequencelength": 1, "mmaxtoken": 0, "mmedusachoic": 0, "mmemorytyp": 1, "mmha": [82, 93], "mminp": 0, "mmintoken": 0, "mmlphiddens": 1, "mmlu": [25, 26, 93], "mmlu_llmapi": 93, "mmmu": 73, "mmodelconfig": [0, 1], "mmodelnam": 1, "mmodelvari": 1, "mmoduleidtomodul": 1, "mmropepositiondelta": 0, "mmroperotarycossin": 0, "mmultiblockmod": 0, "mname": [0, 1], "mnbattentionlay": 1, "mnbhead": 1, "mnbkvheadsperlay": 0, "mnblayer": 1, "mnbrnnlayer": 1, "mngramsiz": 0, "mnorepeatngrams": 0, "mnormalizelogprob": 0, "mnumcopystream": [0, 1], "mnumdecodingenginetoken": 1, "mnumdevicemodulelay": 0, "mnumensurework": 0, "mnumhostmodulelay": 0, "mnumkvheadsperattentionlay": 1, "mnumkvheadspercrossattentionlay": 1, "mnumlanguag": 1, "mnumnod": 0, "mnumputwork": 0, "mnumreturnbeam": 0, "mnumreturnsequ": 0, "mnumsm": 1, "mnumtransformerslay": 1, "modal": 90, "mode": [0, 1, 4, 5, 7, 16, 17, 28, 29, 30, 45, 55, 56, 57, 70, 81, 82, 83, 87, 89, 90, 93, 95], "model": [0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 13, 15, 19, 21, 22, 23, 24, 25, 28, 29, 30, 33, 34, 35, 36, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 64, 66, 67, 70, 71, 72, 75, 78, 81, 82, 83, 85, 86, 87, 89, 90, 94, 97, 98, 99], "model_architectur": 70, "model_cl": 83, "model_cls_fil": 29, "model_cls_nam": 29, "model_config": [29, 70, 87, 95], "model_construct": 70, "model_dir": [10, 13, 14, 15, 16, 17, 19, 27, 54, 55, 73, 76, 84, 86, 88, 92], "model_engin": 98, "model_nam": [56, 74, 87], "model_path": [13, 56, 72, 73], "model_post_init": 70, "model_qu": 73, "model_weights_load": [17, 93], "modelconfig": [0, 6, 87, 93, 95], "modelengin": [96, 98], "modelidtomodel": 1, "modeling_deepseekv3": [26, 28], "modeling_llama": 
95, "modeling_mymodel": 95, "modeling_opt": 95, "modeling_util": [70, 95], "modelnam": 1, "modelopt": [15, 19, 54, 67, 73, 74, 86, 93], "modelopt_cuda_ext": 67, "modelpath": 0, "modelrunn": [15, 87, 93], "modelrunnercpp": [87, 93], "modelrunnermixin": 87, "modeltyp": [0, 13], "modelvari": 1, "modelweightsformat": 17, "modelweightsload": [17, 93], "modern": 87, "modif": [7, 16], "modifi": [3, 7, 65, 73, 77, 80, 81, 92, 93], "modul": [0, 1, 5, 6, 14, 15, 16, 17, 26, 29, 64, 65, 70, 80, 82, 83, 84, 86, 87, 92, 93, 95], "modular": 71, "module1": 26, "module10": 26, "module11": 26, "module12": 26, "module13": 26, "module2": 26, "module3": 26, "module4": 26, "module5": 26, "module6": 26, "module7": 26, "module8": 26, "module9": 26, "module_id": 10, "moduleid": [1, 10], "moduleidtomodel": 1, "modulelist": 95, "moduletyp": 1, "modulo": 82, "moe": [10, 17, 26, 27, 29, 50, 64, 70, 80, 82, 84, 93], "moe_4h_to_h": [10, 29], "moe_allreduce_residual_rms_norm": 82, "moe_backend": [20, 27, 70], "moe_cluster_parallel_s": 70, "moe_ep_s": 4, "moe_expert_parallel_s": [50, 70], "moe_gat": [10, 29], "moe_h_to_4h": [10, 29], "moe_load_balanc": 70, "moe_max_num_token": 70, "moe_plugin": 29, "moe_rout": [10, 29], "moe_tensor_parallel_s": [50, 70], "moe_tp_siz": 4, "moeconfig": 84, "moetopk": 93, "moment": 3, "monboardblock": 0, "monitor": [8, 29], "monitor_memori": [29, 70], "monolith": 5, "monost": 0, "month": 73, "mop": 0, "mopenipc": 1, "moptimaladapters": 0, "morchestratorconfig": 0, "morchleadercomm": 0, "more": [0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 14, 15, 16, 21, 22, 23, 25, 26, 27, 28, 29, 30, 36, 40, 51, 52, 59, 65, 70, 71, 73, 74, 75, 77, 79, 80, 81, 82, 88, 89, 92, 93, 95, 97, 99], "most": [0, 1, 6, 8, 12, 16, 19, 21, 22, 23, 25, 26, 28, 42, 46, 47, 49, 50, 70, 72, 78, 80, 81, 82, 89, 92, 93], "mount": [30, 55, 56, 57], "mount_dest": [55, 56, 57], "mount_dir": [55, 56, 57], "moutdim": 1, "moutdimfirst": 1, "moutputbeamhypothes": 1, "mouttpsplitdim": 1, "move": [0, 1, 8, 19, 52, 70, 
71, 82, 92, 93], "movement": [8, 16], "mownsev": 1, "mownsstream": 1, "mp4": [34, 61], "mpageblock": 1, "mpagedcontextfmha": 1, "mpagedst": 1, "mpagemanagerconfig": 1, "mpagesmutex": 1, "mpagewidth": 1, "mparallelconfig": 0, "mparticipantid": 0, "mpeftcacheconfig": 0, "mpi": [0, 1, 2, 6, 16, 18, 19, 29, 30, 55, 56, 57, 67, 70, 72, 73, 75, 82, 92, 93], "mpi4pi": [69, 75, 92, 93], "mpi_abort": 69, "mpi_barri": 19, "mpi_comm_world": [6, 69], "mpi_group_barri": 1, "mpicomm": 0, "mpicommsess": 70, "mpin": 1, "mpinneddiff": 1, "mpinnedpool": 1, "mpinnedpooldiff": 1, "mpipelineparallel": [0, 1], "mpirun": [15, 16, 69, 75, 92, 93], "mpisess": 70, "mpistat": 0, "mpointer": 1, "mpool": 1, "mport": 0, "mposteriorthreshold": 0, "mppreducescatt": 1, "mprecis": 1, "mpresencepenalti": 0, "mprocessorbatch": 0, "mprocessormap": 0, "mprompttableoffload": 0, "mpt": [25, 90, 91, 93], "mptforcausallm": 84, "mptmodel": 84, "mqa": [5, 8, 21, 24, 26, 29, 82, 93, 97], "mquantmod": 1, "mrank": [0, 1], "mrecvpollperiodm": 0, "mremotenam": 0, "mrepetitionpenalti": 0, "mreplic": 0, "mreqid": 0, "mrequeststatsmaxiter": 0, "mrnnconfig": 1, "mrope": [0, 82], "mrope_param": [83, 87], "mrope_position_delta": [82, 83, 87], "mrope_rotary_cos_sin": [82, 83], "mrope_rotary_cos_sin_s": 84, "mropeconfig": 0, "mropeparam": [83, 87], "mropepositiondelta": 0, "mroperoratysinco": 0, "mrotaryembeddingdim": 1, "mruntimedefault": 1, "mruntimestream": 1, "msamplingconfig": 1, "mscale": 82, "mscale_all_dim": 82, "mschedulerconfig": 0, "msecondaryofflineminprior": [0, 70], "msecondaryoffloadminprior": 0, "mseed": 0, "mselfidx": 0, "msg": [0, 1, 26, 70], "msinktokenlength": 0, "msizeperhead": [0, 1], "mskipcrossattnblock": 1, "msl": 1, "mslotsperpag": 1, "mspawnprocess": 0, "mspeculativedecodingconfig": 0, "mspeculativedecodingmod": 1, "mspeculativedecodingmodul": 1, "msrcdesc": 0, "mstate": [0, 1], "mstoptokenid": 0, "mstream": 1, "msyncmessag": 0, "mt5": 91, "mtag": 0, "mtaskid": 0, "mtemperatur": 0, "mtensor": 
0, "mtensorparallel": [0, 1], "mtoken": 0, "mtokenizerstr": 0, "mtokenrangeretentionconfig": 0, "mtokensperblock": [0, 1], "mtopk": 0, "mtopp": 0, "mtoppdecai": 0, "mtoppmin": 0, "mtoppresetid": 0, "mtotalnumpag": 1, "mtp": [20, 70, 93], "mtp3_autoregress": 26, "mtp3_top1": 26, "mtp3_top10": 26, "mtp3_top15": 26, "mtp3_vanilla": 26, "mtpdecodingconfig": 70, "mtprank": 1, "mtransfermod": 0, "mtrimpool": 1, "mtype": [0, 1], "much": [9, 16, 27, 72, 74, 79, 89], "mul": 82, "multi": [0, 2, 3, 4, 6, 9, 10, 12, 15, 18, 19, 21, 27, 28, 29, 34, 55, 56, 57, 61, 64, 65, 69, 70, 75, 82, 84, 89, 90, 93, 97], "multi_block_mod": [5, 70, 87, 93], "multiblockmod": 0, "multidimension": 82, "multihead": [16, 21], "multimod": [0, 29, 63, 73, 87, 91, 93], "multimodalembed": 0, "multimodalmodelrunn": 87, "multinod": 76, "multinomi": 6, "multipl": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 16, 17, 26, 27, 28, 29, 40, 41, 70, 71, 75, 76, 77, 79, 82, 83, 87, 88, 92, 93, 97], "multiple_profil": [29, 73, 77, 80, 93], "multipli": [5, 17, 28, 82], "multiply_and_lora": 83, "multiply_collect": 83, "multiprocessor": 16, "munsign": 1, "musecrossattent": 1, "musedynamictre": 0, "musegemmallreduceplugin": 1, "musegptattentionplugin": 1, "musegpudirectstorag": 0, "museloraplugin": 1, "musemambaconv1dplugin": 1, "musemrop": 1, "musepositionembed": 1, "museshapeinfer": 1, "musetokentypeembed": 1, "must": [0, 1, 2, 3, 4, 5, 6, 9, 10, 12, 16, 18, 29, 30, 32, 45, 70, 77, 82, 83, 85, 87, 90, 92], "mutabl": [0, 1], "mutablepageptr": 1, "mutex": [0, 1], "mutual": [6, 90], "muvm": 1, "muvmdiff": 1, "mverificationsets": 0, "mversion": 1, "mvocabs": 1, "mvocabsizepad": 1, "mweight": 0, "mwindows": 0, "mworkerexecutablepath": 0, "mworldconfig": 1, "my": [1, 40, 42, 43, 44, 46, 47, 48, 49, 50, 52, 54, 59, 66, 67, 73, 88, 94], "my_faster_on": 36, "my_model": 14, "my_profile_export": [30, 38, 39], "myattent": 95, "mybatchedlogitsprocessor": 52, "myconfig": 95, "mydecoderlay": [14, 95], "mylogitsprocessor": 52, "mymodel": 
[14, 95], "mymodelforcausallm": [14, 95], "n": [1, 2, 5, 10, 12, 15, 16, 27, 28, 30, 42, 45, 46, 47, 48, 49, 50, 52, 55, 56, 57, 58, 59, 69, 70, 73, 75, 79, 82, 83, 84, 89, 90, 92, 93], "n_worker": 70, "na": [73, 93], "naiv": 80, "naivepatternrewriter_replaceaddwithsub": 7, "name": [0, 1, 3, 6, 7, 10, 15, 16, 30, 31, 40, 42, 43, 44, 46, 47, 48, 49, 50, 52, 54, 56, 59, 66, 67, 69, 70, 73, 74, 75, 82, 84, 85, 86, 87, 88, 92, 93, 94, 95], "named_network_output": 92, "named_paramet": 17, "namespac": [0, 1, 69, 84], "nation": 73, "nationwid": 73, "nativ": [19, 22, 28, 93, 95], "native_quant_flow": 84, "natur": [19, 28, 34, 61, 75], "naur": [0, 3, 70], "nb": 84, "nbattentionlay": [0, 1], "nbdim": 1, "nbhead": 1, "nbkvhead": [0, 1], "nbkvheadperlay": 0, "nblayer": 1, "nbrnnlayer": 1, "nccl": [16, 26, 29, 82, 92, 93], "nccl_p2p_level": 93, "nccl_plugin": 29, "ncclplugin": 16, "ncclrecv": 82, "ncclsend": 82, "nd": [73, 82], "ndarrai": [82, 83, 87], "ndim": 82, "nearest": [28, 70, 82], "nearli": [7, 22, 28], "necess": 12, "necessari": [1, 4, 12, 26, 28, 58, 77, 82, 93, 98], "necessarili": [1, 16, 89], "need": [1, 2, 3, 5, 6, 7, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 26, 27, 28, 30, 31, 36, 40, 45, 50, 55, 56, 57, 58, 65, 66, 67, 69, 70, 71, 73, 74, 75, 76, 77, 79, 80, 81, 82, 84, 85, 87, 88, 89, 92, 93, 95, 96, 97, 98, 99], "needed_block": 99, "needsdecoderprologu": 1, "needskvcacherewind": 1, "neg": [1, 70, 81, 82], "neglig": [9, 25, 79], "neither": [3, 82, 89], "nemo": [15, 18, 29, 71, 75, 87, 90, 91, 93], "nemo_ckpt_dir": 84, "nemo_prompt_convert": 87, "nemotron": [91, 93], "nemotron_na": 93, "nemotronforcausallm": 91, "nemotronna": [91, 93], "nemotronnasforcausallm": 91, "neox": [5, 6, 90, 91, 93], "nest": 7, "net": [9, 70, 92], "net_guard": 7, "network": [3, 4, 5, 7, 11, 16, 18, 19, 28, 29, 45, 82, 88, 89, 90, 92, 93], "neural": [4, 7, 16, 88, 93], "neva": [91, 93], "never": [7, 73, 81], "new": [0, 1, 3, 5, 6, 7, 9, 10, 12, 13, 19, 22, 23, 26, 27, 30, 31, 33, 35, 42, 
46, 47, 48, 49, 50, 52, 60, 62, 64, 65, 69, 70, 71, 79, 80, 82, 87, 88, 93, 94, 96, 98], "new_decoder_architectur": [15, 84], "new_generated_id": 87, "new_input": 7, "new_out": 7, "new_shap": 82, "new_tensor": 82, "new_token": 87, "new_workflow": 93, "newactiverequestsqueuelatencym": [0, 30], "newer": [91, 93], "newest": [23, 70], "newli": [0, 70, 79], "newsiz": 1, "newtoken": 1, "newtokensstep": 1, "newtokensvec": 1, "newvalu": 0, "next": [1, 10, 12, 16, 19, 22, 27, 64, 65, 71, 76, 77, 79, 80, 81, 87, 89, 91, 93], "next_logit": 87, "next_medusa_input_id": 87, "next_medusa_logit": 87, "next_step_buff": 87, "next_step_tensor": 87, "nextdraftindic": 1, "nextdraftlen": 1, "nextdraftpath": 1, "nextdraftprob": 1, "nextdrafttoken": 1, "nextdrafttokenslen": 1, "nextflattoken": 1, "nextgenerationlength": 1, "nextn": 27, "nextpositionoffset": 1, "ngc": [66, 67, 88, 93, 94], "ngoanpv": 93, "ngram": [0, 6, 70, 84], "ngramdecodingconfig": 70, "ngramsiz": 0, "ngroup": 82, "nhead": 82, "nhere": 45, "ni": [45, 90], "nine": 88, "nj": 48, "njane": [42, 46, 47, 48, 49, 50, 52], "njason": 58, "nmh": 87, "nmt": [87, 91, 93], "nn": [82, 95], "no_quant": 70, "no_repeat_ngram_s": [6, 70, 87], "no_schedule_after_st": 99, "no_schedule_until_st": 99, "noauxtckernel": 26, "node": [0, 2, 6, 11, 18, 27, 28, 29, 55, 56, 57, 64, 69, 70, 72, 75, 76, 82, 87, 90, 92, 93], "noexcept": [0, 1], "nomin": [42, 46, 47, 48, 49, 50], "non": [0, 2, 5, 8, 13, 16, 19, 25, 26, 27, 28, 29, 52, 70, 82, 92, 93], "non_block": 52, "non_gated_vers": 82, "none": [1, 6, 7, 14, 17, 19, 29, 30, 36, 51, 52, 53, 54, 58, 59, 70, 73, 75, 79, 82, 83, 84, 85, 86, 87, 92, 93, 95, 97], "nonetyp": [70, 87], "nonzero": 82, "nor": 89, "norepeatngrams": [0, 1, 6], "norm": [17, 20, 28, 56, 72, 73, 74, 75, 82, 93, 95], "norm_before_bmm1": [83, 84], "norm_elementwise_affin": 83, "norm_ep": 83, "norm_epsilon": [15, 84], "norm_factor": 5, "norm_num_group": 83, "norm_pre_residual_weight": 82, "norm_quant_fus": 29, "norm_typ": 83, 
"norm_weight": 82, "normal": [0, 6, 9, 10, 13, 25, 26, 27, 28, 70, 73, 82, 89, 93], "normalize_log_prob": 70, "normalize_weight": 10, "normalized_shap": [82, 83], "normalizelogprob": [0, 1], "normedscorescba": 1, "north": [14, 16, 92], "northeastern": 88, "not_op": 82, "notabl": 25, "notat": 27, "note": [1, 2, 7, 9, 10, 11, 12, 16, 20, 23, 25, 26, 27, 28, 29, 32, 36, 51, 55, 56, 57, 59, 64, 65, 70, 73, 74, 77, 79, 81, 82, 85, 87, 89, 90, 91, 92, 94, 95, 98], "notic": [51, 58], "notifysyncmessag": 0, "notimplementederror": 19, "nougat": [90, 91, 93], "nour": 59, "now": [6, 12, 15, 17, 21, 26, 27, 71, 73, 79, 85, 88, 89, 93], "np": 82, "npy": 87, "npytorch_backend_config": 30, "nsight": 64, "nsy": 72, "ntask": [16, 30, 55, 56, 57], "null": [1, 15, 73, 88], "nullopt": [0, 1], "nullptr": [0, 1], "num": [0, 1, 20, 54, 56, 64, 70, 72, 73, 74, 75, 77, 78, 80], "num_attention_head": [15, 82, 83, 84], "num_aud_token": 87, "num_beam": [6, 87], "num_beam_group": 6, "num_block": [87, 98], "num_blocks_per_cache_level": 51, "num_bucket": [82, 83], "num_channel": [83, 84], "num_class": 83, "num_context": 97, "num_ctx_token": 97, "num_draft_token": [0, 82, 87], "num_eagle_lay": [43, 44, 70], "num_embed": 83, "num_experts_per_tok": 4, "num_gener": 97, "num_group": [82, 83], "num_head": [5, 17, 82, 87, 97], "num_hidden_lay": [15, 84, 95, 98], "num_imag": 87, "num_img_token": 87, "num_key_value_head": [15, 84, 98], "num_kv_head": [8, 82, 83, 87, 97, 98], "num_kv_heads_origin": 82, "num_kv_heads_per_cross_attn_lay": 87, "num_kv_heads_per_lay": 87, "num_lay": [82, 83, 87, 98], "num_ln_in_parallel_attn": 84, "num_local_block": 83, "num_local_expert": 4, "num_lora_module_lay": 10, "num_lora_modules_lay": 10, "num_medusa_head": [54, 70, 84, 86, 87], "num_medusa_lay": [84, 86], "num_multimodal_token": 0, "num_nextn_predict_lay": [20, 27, 70], "num_orig_po": 82, "num_po": 82, "num_postprocess_work": 30, "num_profil": 84, "num_q_head": 26, "num_request": [20, 27, 73, 74], "num_return_sequ": 
[87, 93], "num_sampl": 72, "num_task": 83, "num_token": [5, 26, 82, 97], "num_tokens_per_block": [82, 98], "num_tokens_per_task": 83, "num_video": 87, "numa": 11, "numacceptedtoken": 0, "numactiverequest": 0, "numattentionhead": 1, "numavailablepag": 1, "numbeamscba": 1, "number": [0, 1, 2, 3, 4, 5, 6, 8, 12, 16, 20, 24, 26, 27, 28, 29, 30, 52, 55, 56, 57, 70, 73, 74, 75, 76, 77, 79, 80, 81, 82, 83, 87, 89, 90, 92, 93, 95, 97, 98], "numblockspercachelevel": 0, "numcompletedrequest": 0, "numcontextrequest": [0, 1], "numcopystream": [0, 1], "numctxsequ": 1, "numctxtoken": 0, "numdevicemodulelay": 0, "numdrafttoken": [0, 1], "numdrafttokenshost": 1, "numeaglelay": 1, "numel": 87, "numensurework": 0, "numer": [6, 11, 26, 64, 73, 88, 91], "numexpert": 1, "numgeneratedtoken": 0, "numgenrequest": 0, "numgensequ": 1, "numgentoken": 0, "numhead": 6, "numhostmodulelay": 0, "numkvattentionhead": 1, "numkvhead": 6, "numlanguag": 1, "numlay": 6, "nummissedblock": 0, "numnewactiverequest": 0, "numnewallocatedblock": 0, "numnewtokenscumsum": 93, "numnod": [0, 93], "numpag": 1, "numpausedrequest": 0, "numpi": [10, 82, 87], "numputwork": 0, "numqueuedrequest": [0, 93], "numrequestswithdrafttoken": 0, "numreturnbeam": 0, "numreturnsequ": [0, 1, 3], "numreusedblock": 0, "numscheduledrequest": 0, "numsequ": 1, "numslot": 1, "numtoken": 1, "numtotalallocatedblock": 0, "numtransformerslay": 1, "nvcc": 20, "nvcr": 93, "nvfp4": [26, 29, 59, 64, 70, 73, 93, 94], "nvidia": [15, 16, 18, 19, 20, 21, 22, 23, 25, 27, 29, 31, 33, 34, 35, 37, 38, 39, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 64, 65, 67, 71, 73, 74, 75, 80, 82, 88, 89, 91, 92, 93, 94], "nvila": [91, 93], "nvinfer1": [0, 1], "nvl": [1, 29, 93], "nvl36": 76, "nvl72": [28, 76], "nvlink": [2, 6, 11, 75, 76, 78, 93], "nvswitch": [16, 26], "nvtx": 70, "nyou": 45, "o": [0, 1, 7, 10, 19, 24, 26, 28, 55, 56, 57, 72, 92], "o_proj": 17, "oai": [34, 61], "obei": 92, "object": [0, 1, 3, 9, 14, 16, 17, 
19, 36, 45, 70, 82, 83, 84, 85, 87, 88, 89, 96], "observ": [28, 51, 74], "obtain": [2, 18, 74, 82], "obviou": 28, "occas": 92, "occasion": 93, "occup": [5, 89], "occupi": [25, 28, 89], "occur": [6, 9, 98, 99], "odd": 52, "off": [9, 28, 72, 77, 79, 80, 89, 93], "offer": [16, 18, 25, 26, 71, 97], "offic": 45, "officenetsecur": 45, "offici": [5, 20, 27, 73], "offlin": [14, 23, 28, 40, 73, 74, 93], "offload": [0, 8, 13, 29, 64, 70, 93], "offset": [1, 82, 87, 90, 93], "offsetdim": 1, "ofitensor": 0, "often": [0, 3, 8, 12, 21, 25, 26, 70, 76, 77, 82], "ok": 92, "okai": 51, "old": [7, 10, 27, 92], "older": [9, 19, 65, 91], "oldest": [10, 70], "oldvalu": 0, "omit": [1, 3, 19, 82], "ompi": [67, 92], "onboard": [0, 9, 70, 89], "onboard_block": 70, "onboardblock": 0, "onc": [0, 3, 5, 6, 7, 16, 18, 65, 69, 70, 77, 82, 89], "one": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 19, 21, 26, 27, 28, 29, 30, 31, 58, 69, 70, 73, 75, 76, 77, 80, 81, 82, 83, 85, 87, 89, 92, 93, 95, 99], "ones": [0, 10], "oneshot": [26, 82], "oneshotallreduc": 26, "oneshotar": 26, "onevis": [91, 93], "ongo": [19, 59], "onli": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 16, 17, 19, 20, 25, 27, 28, 29, 30, 36, 52, 59, 64, 69, 70, 73, 74, 75, 76, 77, 79, 80, 81, 82, 83, 85, 87, 89, 91, 93, 96, 99], "onlin": [18, 23, 40], "only_cross_attent": 83, "onnx": [29, 82], "onnx__gathernd": 82, "onto": 6, "oom": [20, 21, 24, 28, 89], "ootb": [28, 93], "op": [0, 1, 7, 28, 70, 82, 93], "op_and": 82, "op_or": 82, "op_xor": 82, "opaqu": 7, "opaque_st": 70, "open": [6, 21, 26, 28, 59, 71, 72, 92, 93], "openai": [30, 63, 88, 93], "openipc": 1, "openmpi": 93, "opensora": 93, "openssh": 31, "oper": [0, 1, 3, 5, 6, 7, 11, 12, 15, 16, 17, 26, 28, 29, 52, 70, 73, 76, 77, 80, 82, 88, 89, 91, 93, 96, 97, 98], "opportun": 73, "opt": [3, 15, 25, 28, 31, 82, 90, 91, 92, 93], "opt_batch_s": [70, 84], "opt_num_token": [29, 70, 84], "optforcausallm": [15, 84], "optim": [1, 2, 3, 6, 7, 8, 11, 12, 16, 18, 19, 21, 22, 23, 24, 25, 
29, 46, 52, 54, 65, 69, 70, 71, 73, 74, 76, 77, 78, 82, 88, 89, 91, 92, 93, 94, 96, 97, 98], "optimaladapters": [0, 1], "option": [0, 1, 3, 6, 7, 8, 11, 12, 14, 19, 22, 27, 29, 30, 36, 52, 56, 58, 64, 67, 70, 72, 73, 74, 75, 76, 78, 79, 82, 85, 87, 89, 92, 93, 95, 97, 98], "optionalbufferptr": 1, "optionaltensorptr": 1, "optmodel": 84, "optvec": 1, "orchestr": [0, 2, 12, 92, 93], "orchestratorconfig": 0, "orchleadercomm": 0, "order": [0, 1, 2, 5, 8, 17, 21, 70, 73, 74, 77, 81, 82, 83, 89], "org": [0, 1, 4, 10, 29, 66, 67, 82, 90], "organ": [8, 71, 98], "orient": 28, "origin": [0, 5, 7, 10, 11, 27, 28, 82, 93, 95], "original_max_position_embed": [82, 83], "originaltemperatur": 1, "oserror": 93, "osl": [21, 22, 23, 24, 26, 27, 28, 73, 74, 80], "ostream": [0, 1], "other": [0, 1, 2, 3, 4, 5, 6, 9, 11, 12, 16, 17, 19, 21, 26, 27, 28, 29, 36, 49, 51, 55, 56, 57, 59, 65, 69, 70, 71, 74, 75, 76, 77, 79, 80, 81, 82, 85, 89, 92, 93, 97, 99], "other_audio_input": 87, "other_decoder_input": 87, "other_vision_input": 87, "othercach": 1, "otherwis": [0, 1, 3, 5, 6, 36, 70, 73, 82, 87, 92, 97], "our": [20, 25, 26, 27, 28, 42, 45, 46, 47, 49, 50, 73, 74, 77, 79, 80, 82, 91, 92, 93, 95], "out": [0, 1, 2, 10, 19, 21, 22, 23, 24, 26, 27, 28, 40, 55, 56, 57, 69, 72, 74, 77, 79, 80, 82, 88, 89, 93], "out_bia": 83, "out_channel": 83, "out_context_dim": 83, "out_dim": 83, "out_fatur": 15, "out_featur": [15, 16, 83], "out_hidden_s": 82, "out_of_tree_exampl": 95, "out_point": 82, "out_tp": [21, 24], "outdim": 1, "outdimfirst": 1, "outer": 82, "outlin": 72, "output": [0, 1, 2, 5, 6, 7, 9, 10, 12, 16, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 36, 38, 39, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 56, 58, 59, 66, 67, 70, 72, 74, 75, 76, 77, 78, 80, 81, 82, 83, 87, 88, 92, 93, 94, 96, 97, 99], "output_cum_log_prob": 87, "output_dim": 83, "output_dir": [10, 13, 14, 15, 16, 19, 29, 73, 76, 84, 86, 88, 92], "output_dtyp": [82, 83], "output_generation_logit": 87, "output_id": 87, 
"output_log_prob": 87, "output_multiplier_scal": 84, "output_pad": [82, 83], "output_s": 83, "output_seqlen": [21, 24], "output_sequence_length": 87, "output_timing_cach": [29, 70], "output_token": 73, "outputbuff": 1, "outputconfig": [0, 3, 36, 93], "outputidscba": 1, "outputlen": 0, "outputlogprob": 1, "outputtokenid": [0, 3], "outsid": [12, 18, 19, 97], "outsiz": 1, "outstand": 27, "outtpsplitdim": 1, "outweigh": 76, "over": [0, 1, 9, 12, 17, 20, 22, 23, 25, 26, 28, 32, 70, 72, 73, 76, 79, 80, 82, 93], "overal": [3, 5, 9, 11, 12, 20, 27, 28, 71, 76, 77, 79, 80, 81, 95], "overcom": [5, 16, 26], "overflow": 1, "overhead": [0, 3, 16, 26, 27, 28, 76, 93, 97], "overiew": 73, "overlap": [0, 2, 12, 20, 26, 27, 28, 70, 93, 99], "overload": [0, 1], "overrid": [1, 17, 19, 36, 70, 82, 87], "override_field": 84, "overshadow": 76, "oversubscrib": [69, 75], "overview": [3, 8, 20, 25, 64, 65, 72, 73, 75, 94, 96], "overwhelm": 58, "overwrit": [5, 30], "own": [0, 1, 2, 9, 12, 15, 16, 17, 18, 19, 20, 27, 36, 65, 95], "ownership": 0, "ownsev": 1, "ownsstream": 1, "p": [0, 6, 12, 18, 31, 55, 56, 57, 70, 84, 87, 93], "p2p": 82, "p50": [73, 74], "p90": [73, 74, 75], "p95": [73, 74, 75], "p99": [73, 74, 75], "p_max": 0, "p_x": 0, "pack": [0, 1, 6, 29, 64, 81, 82, 84, 89, 95], "packag": [3, 65, 66, 67, 73, 75, 92, 93], "packed_length": 84, "packedinput": 1, "packedmask": 1, "packedmaskhost": 1, "packedmaskhostcopi": 1, "packedmasksdevic": 1, "packedpositionid": 1, "pad": [0, 1, 6, 7, 10, 28, 29, 30, 64, 70, 71, 82, 83, 87, 89, 93], "pad_id": [70, 87], "pad_lda": 83, "pad_ldc": 83, "pad_token_id": 87, "padding_2d": 82, "padding_back": 82, "padding_bottom": 82, "padding_front": 82, "padding_left": 82, "padding_mod": 83, "padding_right": 82, "padding_top": 82, "padid": 0, "page": [1, 2, 6, 9, 16, 23, 29, 64, 69, 73, 75, 77, 82, 88, 89, 93, 97], "paged_context_fmha": [77, 93], "paged_kv_cach": [10, 29, 73, 87], "paged_st": [29, 87], "pagedcontextfmha": 1, "pagedkvcach": 6, "pagedst": 1, 
"pageid": 1, "pageidx": 1, "pagemanagerconfig": 1, "pageptr": 1, "pagewidth": 1, "pair": [0, 1, 21, 70, 77, 80, 82], "pale": 51, "paper": [2, 10, 12, 22, 27, 28, 90, 97], "par": [79, 80], "parallel": [0, 2, 3, 5, 6, 12, 15, 16, 20, 21, 23, 24, 27, 30, 40, 41, 50, 52, 64, 70, 74, 77, 78, 82, 83, 84, 89, 93, 95, 99], "parallel_attent": [15, 84], "parallelconfig": [0, 93], "param": [0, 1, 17, 46, 47, 48, 50, 51, 59, 70, 82, 83, 84, 87], "paramet": [0, 1, 3, 4, 5, 8, 9, 10, 12, 13, 15, 16, 17, 19, 20, 28, 29, 30, 55, 70, 73, 76, 77, 78, 81, 82, 83, 84, 87, 89, 93, 97], "parametr": 87, "parent": [0, 1, 17, 19], "parent_hash": 51, "parenthash": 0, "parentid": 1, "pari": [42, 46, 47, 48, 49, 50, 59], "pars": [1, 70], "parse_arg": 54, "parser": [30, 54, 63], "part": [1, 3, 4, 7, 16, 17, 19, 28, 64, 65, 69, 70, 71, 74, 79, 80, 81, 82, 87, 89], "part2": 93, "parti": 93, "partial": [0, 4, 9, 16, 26, 70, 76], "particip": [0, 59, 82, 93], "participantid": [0, 2], "particular": [0, 3, 69, 78, 79, 80, 88], "particularli": [26, 28, 65, 80, 98], "partit": [5, 10, 16, 55, 56, 57], "pass": [0, 1, 3, 5, 7, 9, 10, 12, 16, 17, 36, 52, 58, 59, 70, 72, 73, 75, 77, 79, 80, 82, 83, 84, 87, 89, 93, 94, 95, 96, 97, 99], "past": [0, 5, 27], "past_key_valu": [82, 83], "past_key_value_length": 83, "past_key_values_length": 83, "past_kv_length": 87, "past_sequence_length": 87, "patch": [83, 87], "patch_siz": [83, 84], "path": [0, 1, 3, 5, 12, 15, 17, 20, 27, 29, 30, 36, 46, 47, 48, 49, 50, 54, 55, 56, 57, 59, 65, 69, 70, 72, 73, 74, 75, 77, 82, 87, 93], "path_to_llama_from_hf": 96, "path_to_meta_llama_from_hf": 69, "path_to_trt_engin": 69, "pathlib": [54, 70], "pathlik": 84, "pathorn": 93, "pathsoffset": 1, "pattern": [4, 26, 28, 64, 70, 82, 93], "patternanalyz": 7, "patternrewrit": 7, "paus": [0, 81, 99], "paused_request": 99, "pcie": [11, 29], "pdf": [0, 4, 10], "pdl": [26, 93], "peak": [0, 20, 21, 22, 26, 74], "peft": 70, "peft_cache_config": [36, 49, 70], "peftcacheconfig": [0, 70], 
"peftcachemanag": [0, 93], "penal": [0, 6, 70], "penalti": 93, "penalty_alpha": 6, "pend": 99, "pending_request": 99, "per": [0, 1, 3, 5, 6, 8, 11, 12, 16, 19, 20, 21, 23, 24, 26, 27, 28, 29, 30, 55, 56, 57, 70, 73, 74, 75, 76, 77, 82, 83, 89, 90, 93], "per_channel": 90, "per_group": 90, "per_token": 90, "per_token_scal": 82, "perceiv": 22, "percent": [0, 13], "percentag": [10, 13, 73, 74, 75], "percentil": [73, 93], "perf": [0, 20, 28, 30, 63, 70, 82, 93], "perf_best_practic": 93, "perform": [0, 1, 2, 3, 5, 6, 7, 10, 16, 17, 18, 19, 21, 23, 24, 27, 28, 29, 30, 36, 65, 69, 70, 71, 73, 74, 76, 79, 81, 82, 87, 88, 91, 93, 95, 97, 98], "performantli": 21, "permut": 82, "persimmon": 93, "persist": [25, 69], "person": [31, 58], "phase": [0, 2, 7, 12, 21, 24, 26, 27, 28, 29, 64, 73, 78, 79, 80, 81, 82, 89, 93, 97, 98], "phi": [69, 82, 90, 91, 93], "phi3config": 84, "phi3forcausallm": 84, "phi3model": 84, "phiconfig": 84, "phiforcausallm": 84, "phimodel": 84, "physic": [82, 89], "picasso": 59, "pick": 79, "pickl": 93, "piec": 79, "piecewis": 70, "pin": [0, 1, 9], "ping": 93, "pinnedmemusag": 0, "pinnedpool": 1, "pip": [20, 30, 65, 66, 67, 88, 93], "pip3": [66, 67], "pipelin": [0, 1, 3, 6, 16, 21, 24, 29, 30, 50, 70, 73, 74, 78, 89, 93, 99], "pipeline_parallel_s": [50, 70, 76, 77], "pipelineparallel": [0, 1, 6], "pipelineparallelismrank": 1, "pitfal": [9, 19], "pixart": 83, "pixartalphatextproject": 83, "pixel_valu": 84, "pl": [67, 73], "place": [1, 29, 51, 67, 82, 93, 95], "placement": 26, "plai": 79, "plan": [3, 5, 26, 65], "planner": 93, "platform": [31, 32, 42, 46, 47, 49, 50, 65, 71, 73, 93, 94], "pleas": [2, 5, 7, 11, 12, 14, 21, 23, 24, 25, 26, 28, 32, 36, 45, 52, 65, 67, 73, 74, 76, 78, 82, 92, 93, 94, 99], "plu": [11, 87], "plugin": [5, 6, 7, 13, 15, 64, 65, 70, 79, 82, 84, 88, 89, 90, 92, 93], "plugin_config": [70, 77, 80, 82, 84], "plugin_namespac": 7, "plugin_typ": 7, "plugin_v2": 7, "plugin_v2_gemm_0": 92, "pluginconfig": [70, 85], "pluginconfigmeta": 85, 
"pluginfield": 93, "pluginv2build": 92, "pm": [20, 26, 73], "pmi": 92, "pmi2_init": 92, "pmix": [16, 30, 55, 56, 57, 92], "png": [34, 39, 61], "po": 83, "point": [1, 5, 16, 18, 22, 25, 40, 45, 50, 66, 67, 69, 70, 74, 76, 81, 82, 88, 90, 92, 93], "pointer": [0, 1, 6, 17, 82, 87, 93], "pointerelementtyp": 1, "polar": 91, "polici": [0, 1, 2, 70, 73, 75, 89], "poll": [0, 30], "polyhedr": 16, "pong": 93, "pool": [0, 1, 5, 28, 64, 70, 82, 87, 98, 99], "pooled_project": [83, 84], "pooled_projection_dim": 83, "pooledpin": 0, "poor": 2, "popd": 92, "popfirstgentoken": 0, "popul": [1, 5, 16, 59, 82], "popular": [5, 15, 19, 25, 27, 32, 69], "port": [0, 30, 32, 37], "portfolio": 23, "portion": [4, 76, 82, 89], "pos_emb_typ": 82, "pos_embd_param": 97, "pos_embed_max_s": 83, "pos_embed_typ": 83, "pose": 80, "posit": [0, 1, 12, 26, 27, 70, 73, 82, 83, 87, 93, 97], "position_embed": [82, 83], "position_embedding_typ": [5, 15, 82, 83, 84], "position_encoding_2d": 84, "position_id": [84, 87, 92, 95, 97], "positionalembeddingparam": 97, "positionembeddingtyp": [5, 82, 83, 84], "positionid": [0, 1], "positionidsbas": 1, "positionidsdevic": 1, "positionidshost": 1, "positionidshostcopi": 1, "positionoffset": 1, "positionoffsetsdevic": 1, "positionoffsetshost": 1, "positionoffsetshostcopi": 1, "posix": 0, "posix_debug_fallback": 0, "possibl": [2, 3, 5, 6, 9, 12, 16, 20, 27, 28, 29, 36, 65, 71, 72, 73, 74, 77, 79, 81, 82, 89, 92, 93, 96], "possibli": [1, 8, 82], "post": [0, 15, 22, 25, 26, 27, 28, 59, 71, 72, 82, 88, 93], "post_act_fn": 83, "post_attention_layernorm": [17, 95], "post_input_id": 87, "post_layernorm": [14, 15, 17, 82, 92], "post_pad": 82, "post_prompt": 87, "post_strid": 82, "posterior_threshold": [43, 44, 70], "posterioralpha": 1, "posterioralphahost": 1, "posteriorthreshold": [0, 1], "posteriorthresholdhost": 1, "postprocess": [30, 83], "postprocessor": [0, 70], "postprocparam": 70, "potenti": [0, 1, 8, 12, 28, 29, 72, 73, 77, 95], "pow": 82, "power": [9, 16, 23, 25, 26, 
28, 71, 79, 93], "pp": [0, 2, 6, 10, 21, 24, 30, 73, 75, 82, 93], "pp2": 73, "pp_communicate_final_output_id": 87, "pp_communicate_new_token": 87, "pp_reduce_scatt": [29, 80], "pp_size": [15, 16, 30, 37, 73, 74, 76, 86, 93], "ppreducescatt": 1, "pr": 26, "practic": [5, 8, 16, 22, 23, 26, 28, 88, 89, 93], "pre": [0, 1, 3, 5, 15, 18, 65, 67, 70, 71, 73, 82, 88, 89, 93, 97], "pre_input_id": 87, "pre_layernorm": 82, "pre_onli": 83, "pre_pad": 82, "pre_prompt": 87, "pre_quant_scal": [15, 70], "pre_strid": 82, "prebuilt": 65, "preced": [16, 82], "precis": [1, 6, 17, 21, 25, 29, 64, 73, 77, 80, 85, 88, 89, 91, 93], "precompute_relative_attention_bia": 84, "precomputed_relative_attent": 83, "predefin": [12, 95, 97], "predict": [1, 5, 12, 26, 27, 93], "predicteddraftlogit": 1, "predictor": 12, "predictsdrafttoken": 1, "prefer": [25, 65], "prefer_managed_weight": 83, "prefer_plugin": 82, "prefetch": 26, "prefil": [0, 28, 70, 78], "prefix": [3, 12, 15, 27, 51, 69, 75, 82, 85, 92], "preliminari": [21, 23, 24], "preload": 17, "premis": 27, "prepar": [0, 1, 2, 26, 27, 51, 56, 64, 72, 79, 82, 84, 90, 93, 97], "prepare_dataset": [20, 56, 72, 73, 74, 75], "prepare_input": [84, 89], "prepare_position_ids_for_cogvlm": 87, "prepare_recurrent_input": 84, "prepare_resourc": [96, 98], "prepareforward": 1, "prepend": 92, "preprocess": [17, 87, 90], "preprocess_weights_hook": 84, "preprocessor": 73, "prequant_scaling_factor": 15, "prerequisit": [64, 66, 67], "presenc": [6, 16, 51], "presence_penalti": [70, 87, 93], "presencepenalti": [0, 1, 6], "present": [0, 70, 73, 79, 80, 90, 93], "preserv": 77, "presid": [40, 42, 43, 44, 46, 47, 48, 49, 50, 52, 54, 59, 66, 67, 75, 81, 88, 94], "pretrain": 18, "pretrained_config": 95, "pretrained_model_name_or_path": 84, "pretrainedconfig": [14, 19, 70, 84, 85, 95], "pretrainedmodel": [19, 84, 89], "pretrainedtokenizerbas": 70, "prevdrafttokenslen": 1, "prevent": [26, 28, 64, 69], "preview": 93, "previou": [1, 3, 4, 12, 19, 20, 22, 27, 73, 75, 76, 77, 
79, 80, 81, 93], "previous": [1, 21, 77, 79, 81, 93], "prevscor": 1, "prewritten": 88, "price": 73, "primari": [0, 1, 8, 25, 89, 99], "primarili": 97, "primit": [16, 28, 71, 88], "print": [1, 5, 30, 36, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 58, 59, 60, 61, 62, 66, 67, 70, 73, 74, 75, 81, 88, 89, 92, 94], "print_iter_log": [20, 56, 70], "prior": [3, 29, 65, 67], "priorit": [25, 79, 81], "prioriti": [0, 1, 8, 9, 17, 70], "prioritytyp": 0, "priorityupd": 0, "privat": [0, 1, 6, 70], "privileg": 7, "prm": 91, "pro": 26, "prob": 82, "probabilist": 83, "probabl": [0, 1, 6, 9, 12, 26, 27, 70, 82, 87, 93], "probil": 1, "problem": [5, 20, 28, 92], "proc": 17, "proccessed_weight": 17, "proccessed_zero": 17, "procedur": 20, "proceed": 16, "process": [0, 1, 2, 3, 5, 6, 8, 11, 12, 15, 16, 19, 20, 26, 27, 28, 29, 40, 45, 50, 52, 55, 56, 57, 66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 79, 80, 81, 82, 87, 88, 92, 93, 95, 96, 97, 99], "process_input": 87, "process_logits_including_draft": 87, "processor": [0, 5, 40, 41, 53, 70, 84, 87, 93], "processorbatch": 0, "processormap": 0, "prod": 82, "produc": [0, 1, 3, 7, 16, 36, 73, 75, 77, 79, 80, 82, 93], "product": [4, 5, 12, 16, 23, 71, 79, 80, 81, 82, 88, 97], "profil": [2, 29, 30, 38, 39, 64, 77, 79, 82, 87, 89, 92, 93], "profiling_verbos": [29, 70], "profit": [12, 73], "program": [2, 19, 40, 42, 46, 47, 49, 50, 52, 66, 67, 69, 81, 88, 92], "progress": [1, 26, 70, 73, 82], "proj": [15, 17, 92], "project": [5, 10, 28, 59, 65, 82, 83, 95, 98], "projector_hidden_act": 84, "prologu": [55, 56, 57], "promin": 12, "promis": [12, 19, 27], "prompt": [0, 3, 6, 9, 14, 20, 29, 30, 35, 36, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 58, 59, 62, 64, 66, 67, 70, 73, 75, 79, 80, 81, 83, 87, 88, 93, 94, 97], "prompt_adapter_request": [70, 93], "prompt_embedding_t": [83, 84, 87], "prompt_embedding_table_s": 84, "prompt_id": 52, "prompt_len": 97, "prompt_logprob": 70, "prompt_lookup": [12, 93], "prompt_lookup_num_token": 
[6, 70], "prompt_tabl": 87, "prompt_task": [84, 87], "prompt_token": 88, "prompt_token_id": [36, 53, 70], "prompt_vocab_s": [84, 87], "promptadapterrequest": 70, "promptinput": [70, 93], "promptlen": 0, "prompttableoffload": 0, "prompttuningconfig": 0, "prompttuningembed": 83, "prompttuningen": 1, "pronounc": 12, "proof": 98, "propag": [9, 93], "proper": [2, 73], "properli": [17, 79, 81], "properti": [3, 45, 70, 82, 84, 85, 87], "proport": 5, "propos": [0, 26], "protect": [1, 40, 50, 66, 67, 69, 88], "protocol": [0, 30, 45], "proud": 26, "prove": [12, 28], "provid": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19, 20, 21, 22, 25, 26, 28, 29, 30, 31, 36, 45, 54, 59, 65, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 87, 89, 91, 92, 93, 95, 96, 97], "proxy_dispatch_result_thread": 73, "prune": [7, 12, 82], "pseudo": [5, 82, 90], "pth": [17, 93], "ptq": [25, 77, 93], "ptr": 1, "ptr_idx": 17, "ptrdiff_t": 1, "ptuning_setup": 87, "ptuning_setup_fuyu": 87, "ptuning_setup_llava_next": 87, "ptuning_setup_phi3": 87, "ptuning_setup_pixtr": 87, "ptuningconfig": 0, "public": [0, 1, 25, 32, 54, 59], "publish": [20, 21, 24, 73, 74, 93], "pull": [18, 20, 65, 88, 93], "puneeshkhanna": 93, "purchas": 73, "pure": 87, "purpos": [5, 8, 28, 65, 75, 77, 79, 80], "pursu": [42, 46, 47, 49, 50, 52], "push": [28, 31, 53], "pushd": 92, "put": [1, 15, 26, 55, 56, 57, 69, 71, 79], "pwd": [20, 65], "py": [3, 4, 5, 7, 10, 12, 13, 14, 15, 16, 17, 19, 20, 26, 27, 28, 52, 55, 56, 65, 67, 69, 72, 73, 74, 75, 76, 77, 82, 85, 87, 88, 92, 93, 95, 96, 98, 99], "py3": 93, "py_executor_cr": 99, "pybind": 93, "pybind11_object": 70, "pybindmirror": 70, "pydant": [70, 93], "pydantic_cor": 70, "pyexecutor": [93, 98, 99], "pynvml": 93, "pypi": [65, 93], "python": [1, 5, 6, 7, 10, 12, 14, 16, 18, 19, 20, 27, 28, 30, 36, 47, 48, 64, 66, 67, 69, 72, 73, 74, 75, 76, 88, 90, 93, 95, 96, 98, 99], "python3": [10, 13, 15, 20, 55, 56, 65, 67, 72, 73, 88, 92], "python_bind": 20, "python_e2": 87, 
"python_plugin": 93, "pythonpath": [20, 56, 57], "pytorch": [7, 12, 15, 18, 20, 27, 28, 30, 37, 51, 55, 56, 57, 64, 65, 66, 67, 70, 74, 82, 93, 96, 97, 98, 99], "pytorch_backend_config": 30, "pytorch_eagle_weights_path": 70, "pytorch_extra_arg": 56, "pytorch_model": 92, "pytorch_model_engin": 96, "pytorch_model_registri": 98, "pytorchconfig": [70, 97], "pytorchmodelengin": [96, 98], "pzzzzz5142": 93, "q": [2, 5, 6, 10, 21, 26, 28, 64, 73, 82, 92, 95, 97], "q_b_proj": 82, "q_dim": 82, "q_lora_rank": [82, 83], "q_proj": [17, 95], "q_scale": [5, 82, 83, 84], "qa": 12, "qformat": [73, 86], "qgmma": 93, "qingquansong": 93, "qk_layernorm": [83, 84], "qk_nope_head_dim": [82, 83], "qk_norm": 83, "qk_rope_head_dim": [82, 83], "qkv": [7, 10, 15, 17, 64, 82, 92, 93, 97], "qkv_bia": [82, 93], "qkv_dim": 82, "qkv_proj": 95, "qo_indptr": 97, "qpi": 11, "qserv": 93, "quadrat": [5, 89], "qualiti": [27, 77, 80], "qualnam": [70, 82, 84, 86], "quant": [19, 70, 73, 82, 93, 94], "quant_algo": [15, 17, 19, 36, 59, 70, 73, 77, 84], "quant_and_calib_config": 59, "quant_config": [19, 36, 59, 70, 77, 84, 97], "quant_medusa_head": 86, "quant_mod": [19, 70, 83, 84, 87], "quantalgo": [36, 59, 70, 77, 84, 86], "quantconfig": [19, 36, 59, 70, 77, 84, 93, 97], "quanticonfig": 19, "quantiz": [5, 6, 11, 16, 17, 20, 21, 22, 26, 28, 29, 40, 41, 46, 54, 64, 67, 68, 69, 70, 71, 74, 75, 78, 82, 83, 84, 87, 88, 91, 93, 95, 97], "quantizaton": 73, "quantize_and_export": 86, "quantize_kwarg": 84, "quantize_lm_head": [86, 93], "quantized_valu": 5, "quantizedkernel": 16, "quantizetensorplugin": 16, "quantmod": [1, 5, 6, 64, 70, 82, 83, 84, 86, 87], "quantmodewrapp": [70, 82], "queri": [3, 6, 8, 12, 16, 21, 28, 30, 64, 73, 82, 89, 97, 98], "query_dim": 83, "query_key_valu": 17, "query_length": 83, "query_pre_attn_scalar": 84, "question": [58, 73, 89, 92], "queu": [0, 74, 79], "queue": [0, 70, 71, 96], "quick": [5, 64, 71, 73, 75, 97], "quick_gelu": 82, "quicker": 76, "quickli": [19, 88], "quickstart": [69, 
75], "quickstart_advanc": [27, 55], "quit": [7, 69], "qweight": 17, "qwen": [17, 30, 39, 69, 73, 82, 90, 91, 93], "qwen1": [91, 93], "qwen2": [10, 30, 34, 39, 61, 73, 91, 93], "qwen2_5_vlforconditionalgener": 91, "qwen2audio": 93, "qwen2forcausallm": 91, "qwen2forprocessrewardmodel": 91, "qwen2forrewardmodel": 91, "qwen2forsequenceclassif": 93, "qwen2vl": 93, "qwen2vlforconditionalgener": 91, "qwenforcausallm": 17, "qwenforcausallmgenerationsess": 87, "qwenvl": 93, "qwq": 91, "qychen": 10, "qzero": 17, "r": [1, 10, 30, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 58, 59, 66, 67, 75, 81, 82, 88, 92, 93, 94], "r1": [30, 63, 74, 93], "r1_in_tensorrt": [26, 93], "race": 93, "radix": 98, "rai": 1, "rais": [19, 70, 75, 92, 93], "rand": 82, "rand_data": 82, "rand_data_sampl": 84, "rand_data_valid": 84, "random": [0, 6, 30, 38, 39, 70, 74, 82, 93], "random_se": [70, 84, 87], "randomdatasampl": 1, "randomdatavalid": 1, "randomli": 74, "randomse": [1, 6, 93], "randomseedtyp": 0, "rang": [0, 6, 9, 12, 72, 80, 82, 84, 89, 90, 91, 92, 95], "rank": [0, 1, 2, 3, 4, 6, 10, 19, 20, 28, 29, 69, 73, 82, 84, 87, 89, 92, 93], "rank0": 15, "rank1": 15, "rapid": [12, 74, 88], "rate": [0, 20, 26, 27, 28, 30, 38, 39, 73, 74, 75, 93], "rather": [5, 7, 12, 28, 67, 71], "ratio": 28, "rational": 28, "raw": 30, "raw_audio": 87, "raw_imag": 87, "rdma": 2, "re": [20, 25, 70, 71, 93, 97], "reach": [0, 5, 15, 69, 73, 77, 81], "read": [0, 2, 3, 5, 12, 14, 16, 17, 20, 26, 27, 29, 58, 70, 73, 93], "read_config_from_the_custom_training_checkpoint": 19, "readabl": 73, "reader": 82, "readi": [0, 88], "readm": [2, 12, 30, 69, 75, 93], "real": [7, 20, 26, 65, 75, 77, 79, 80, 82, 92], "realiti": 79, "realiz": [9, 12], "rearrang": 82, "reason": [0, 5, 6, 16, 19, 26, 27, 30, 63, 70, 73, 76, 79, 80, 82, 92], "reasoning_pars": [30, 37], "rebuild": [80, 82, 92], "receiv": [0, 1, 2, 3, 4, 11, 12, 77, 82, 93], "recent": [1, 4, 5, 22, 26], "recip": [26, 28, 30, 70, 90], "reclaim": 0, "recogn": [12, 26, 
73, 95], "recommend": [2, 5, 6, 12, 14, 17, 18, 20, 22, 25, 28, 30, 52, 65, 70, 73, 78, 79, 81, 92, 93, 95, 97], "recompute_scale_factor": 82, "reconfigur": [3, 67], "reconstruct": [5, 82], "record": [1, 7, 20, 26, 27, 70], "recored": 0, "recreat": 18, "recurr": 12, "recurrentgemma": [90, 91, 93], "recurrentgemmaforcausallm": 84, "recurs": [20, 65, 69], "recv": [0, 16, 82], "recvconnect": 0, "recvpollperiodm": 0, "recycl": [5, 98], "redesign": 93, "redirect": [7, 70], "redraft": [64, 82, 87, 93], "redrafter_draft_len_per_beam": 87, "redrafter_inverted_temperatur": 84, "redrafter_num_beam": 87, "redrafterforcausallm": 84, "reduc": [2, 3, 4, 5, 9, 11, 12, 16, 20, 21, 24, 26, 27, 28, 29, 65, 69, 71, 72, 73, 74, 75, 76, 79, 81, 82, 89, 92, 93, 97], "reduce_fus": [29, 73, 77, 80], "reduce_scatt": 82, "reduceoper": 82, "reducescatt": [29, 80, 93], "reduct": [11, 12, 26, 81, 82], "redund": [12, 26], "refactor": [19, 27, 93], "refer": [0, 1, 2, 3, 5, 6, 7, 8, 10, 12, 16, 18, 19, 20, 30, 32, 33, 34, 35, 36, 37, 38, 39, 40, 52, 60, 61, 62, 65, 69, 71, 73, 74, 75, 76, 77, 78, 80, 82, 88, 91, 93, 95, 97], "referenc": 77, "reference_wrapp": [0, 3], "refin": 93, "refit": [16, 29, 93], "refit_engin": 16, "reflect": 79, "refresh": 73, "regard": 82, "regardless": 92, "regex": [3, 70], "region": 72, "regist": [31, 64, 92, 93, 95], "register_auto_model": 95, "register_network_output": 92, "registerdesc": 0, "registermemori": 0, "regress": [5, 6, 16], "regular": [0, 3, 5, 26, 70, 82], "reinforc": 78, "reject": [0, 27], "rel": [9, 21, 79, 81, 82, 93], "rel_attn_t": 83, "relat": [2, 4, 8, 17, 64, 71, 72, 82, 85, 89, 92, 93, 94, 95, 98], "relationship": 89, "relative_attent": [82, 83], "relative_attention_bia": 82, "relax": 5, "relaxed_delta": [26, 27, 70], "relaxed_topk": [26, 27, 70], "releas": [1, 5, 6, 8, 19, 21, 24, 25, 64, 71, 82, 84, 89, 90, 91], "release_build": 65, "release_run": [65, 88], "releasepag": 1, "releasest": 0, "relev": [6, 65, 98], "reli": [2, 5, 7, 19, 69, 72, 90], 
"reload": 3, "relu": [15, 16, 82, 92], "remain": [0, 7, 9, 12, 13, 26, 65, 74, 75, 77, 79, 80, 82, 89, 93], "remaind": 77, "remark": [26, 27], "remind": [5, 97], "remot": 70, "remotenam": 0, "remov": [0, 1, 5, 6, 7, 8, 16, 17, 20, 27, 29, 30, 54, 65, 70, 71, 77, 82, 89, 93, 95], "remove_const_t": 1, "remove_cv_t": 0, "remove_duplicated_kv_head": 84, "remove_input_pad": [5, 10, 29, 82, 83, 87], "remove_pointer_t": 1, "remove_reference_t": 1, "remove_sequ": 98, "renam": 93, "reorder": [82, 83], "reorder_kv_cache_for_beam_search": 87, "rep": 72, "repeat": [0, 5, 27, 28, 70, 82], "repeat_interleav": 82, "repeatedli": 12, "repetit": [0, 6, 70, 82], "repetition_penalti": [6, 70, 87, 93], "repetitionpenalti": [0, 1, 6], "replac": [1, 4, 7, 16, 17, 19, 20, 28, 73, 75, 77, 81, 82, 89, 95], "replace_add_with_sub": 7, "replace_all_uses_with": [7, 82], "replace_input_with": 7, "replace_output_uses_with": 7, "replace_outputs_uses_with": 7, "replic": [0, 3, 26, 82], "replit": [90, 91, 93], "repo": [19, 69, 71, 75, 92], "repo_id": 58, "report": [8, 27, 28, 72, 73, 74, 89, 93], "reportpluginerror": 92, "repositori": [12, 18, 20, 31, 69, 88], "repres": [0, 1, 2, 8, 12, 20, 21, 25, 26, 45, 58, 70, 73, 79, 82, 87, 99], "represent": [7, 16], "reproduc": [64, 73, 93], "req": [20, 73, 74, 75, 77, 79, 80], "req_id": 52, "req_logit": 52, "req_stat": 99, "req_token_id": 52, "reqbeamwidth": 1, "reqid": 0, "reqpromptlength": 1, "request": [0, 2, 5, 6, 9, 10, 16, 20, 22, 24, 27, 28, 29, 30, 38, 39, 52, 56, 70, 71, 72, 73, 74, 75, 77, 79, 80, 81, 82, 88, 89, 93, 96, 97, 98, 99], "request_id": [36, 53, 70, 97], "request_stats_max_iter": 70, "request_timeout": 30, "request_typ": 70, "request_type_context_and_gener": [0, 2], "request_type_context_onli": [0, 2], "request_type_generation_onli": [0, 2], "requesterror": 70, "requestid": [0, 2, 3], "requestidtyp": 0, "requestlist": 99, "requestoutput": [36, 53, 70, 93], "requestperfmetr": 0, "requestschedul": 99, "requeststag": 0, "requeststat": 0, 
"requeststatsmaxiter": 0, "requeststatsperit": 0, "requeststatsperiter": 0, "requeststatsvec": 0, "requesttoken": 3, "requesttyp": [0, 1, 2, 70], "requesttypesdevic": 1, "requestvector": 1, "requir": [0, 2, 5, 6, 9, 10, 12, 16, 17, 19, 20, 21, 25, 26, 28, 29, 30, 45, 58, 65, 66, 67, 70, 73, 74, 75, 76, 77, 80, 82, 83, 88, 89, 91, 92, 93, 98], "require_ln_f": 84, "requiresattentionmask": 1, "rerun": 80, "rescale_output_factor": 83, "research": [5, 27, 32, 42, 46, 47, 49, 50, 90], "resembl": 51, "reserv": [0, 1, 30, 70, 81, 87, 89, 99], "reserved_block": 99, "reset": [0, 1, 6, 70, 73, 87], "resetspeculativedecodingmodul": 1, "reshap": [1, 82], "resid": [10, 59], "residu": [82, 92], "residual_connect": 83, "residual_mlp": 84, "residual_multipli": 84, "residual_rms_norm": 82, "residual_rms_norm_out_quant_fp8": 82, "residual_rms_norm_out_quant_nvfp4": 82, "residual_rms_norm_quant_fp8": 82, "residual_rms_norm_quant_nvfp4": 82, "residual_rms_prepost_norm": 82, "residualadd": [29, 80, 93], "resiz": 1, "resolv": [34, 61, 92], "resourc": [0, 2, 5, 19, 26, 28, 96, 98, 99], "respect": [4, 36, 81, 82, 87, 89, 90, 95, 99], "respons": [0, 2, 8, 30, 36, 60, 61, 62, 70, 73, 82, 96], "responsewithid": 0, "rest": [1, 5, 77], "restart": 0, "restrict": [0, 2, 3, 6, 65, 70, 82], "result": [0, 1, 4, 5, 11, 12, 16, 21, 22, 23, 25, 27, 28, 29, 36, 64, 65, 70, 73, 76, 77, 78, 79, 80, 82, 83, 93, 95, 97, 99], "retail": 73, "retain": [21, 23], "retent": [0, 70], "retentionprior": 0, "retentionpriorityanddur": 0, "rethink": 12, "retriev": [1, 17, 70, 74, 82], "return": [0, 1, 3, 7, 10, 12, 14, 16, 17, 19, 36, 70, 73, 79, 82, 83, 84, 87, 89, 92, 93, 98, 99], "return_all_generated_token": 87, "return_context_logit": 70, "return_dict": 87, "return_encoder_output": [70, 87], "return_generation_logit": 70, "return_perf_metr": 70, "returnallgeneratedtoken": [0, 3], "returncontextlogit": 0, "returnencoderoutput": 0, "returngenerationlogit": 0, "returnlogprob": 0, "returnperfmetr": 0, "reus": [0, 2, 
3, 8, 27, 29, 64, 68, 70, 82, 87, 89, 93, 95, 98], "reusabl": [8, 9], "reusedblock": 0, "reusedblocksperrequest": 0, "reveal": [26, 28], "revers": 82, "revert": 82, "review": 73, "revis": 70, "revolution": 71, "rewind": [27, 93], "rewrit": [64, 82, 93, 95], "rewritepatternmanag": 7, "rewrt": 92, "rf": 92, "rg_lru": 82, "rgc": 73, "rh": [0, 1], "rich": 15, "right": [71, 77, 82, 92], "rigor": [51, 73], "risk": [2, 16, 77, 81], "rm": [65, 82, 91, 92, 95], "rms_norm": [26, 82, 95], "rmsnorm": [10, 26, 82, 83, 84, 93, 95], "rnn": [29, 93], "rnn_conv_dim_s": 87, "rnn_head_siz": 87, "rnn_hidden_s": 87, "rnn_state": 84, "rnnconfig": 1, "rnnconvdims": 1, "rnnheadsiz": 1, "rnnhiddens": 1, "ro": 20, "roberta": [91, 93], "robertaforquestionansw": 84, "robertaforsequenceclassif": 84, "robertamodel": 84, "robin": 2, "robust": [26, 93], "rock": 82, "role": [16, 30, 33, 34, 45, 60, 61, 79, 88], "roll": 64, "rooflin": 28, "root": [15, 20, 31, 65, 67, 69, 70, 75, 82, 88], "root_lay": 7, "rope": [26, 28, 82, 87, 93, 97], "rope_gpt_neox": [5, 82, 84], "rope_gptj": [5, 82], "rope_local_base_freq": 84, "rope_scaling_config": 82, "rope_scaling_long_factor": 83, "rope_scaling_long_mscal": 83, "rope_scaling_short_factor": 83, "rope_scaling_short_mscal": 83, "ropeembeddingutil": 82, "rotari": [0, 26, 82, 87, 95, 97], "rotary_bas": 84, "rotary_cos_sin": 82, "rotary_dim": 84, "rotary_embed": 95, "rotary_embedding_bas": [82, 83], "rotary_embedding_base_loc": 83, "rotary_embedding_beta_fast": 83, "rotary_embedding_beta_slow": 83, "rotary_embedding_dim": [5, 82, 84], "rotary_embedding_long_m_scal": 82, "rotary_embedding_max_posit": 82, "rotary_embedding_mscal": 83, "rotary_embedding_mscale_all_dim": 83, "rotary_embedding_origin_max_posit": 83, "rotary_embedding_original_max_posit": 82, "rotary_embedding_percentag": 83, "rotary_embedding_sc": 83, "rotary_embedding_scal": 82, "rotary_embedding_scale_typ": 82, "rotary_embedding_short_m_scal": 82, "rotary_inv_freq": [82, 83], "rotary_inv_freq_loc": 
83, "rotary_pct": 84, "rotary_sc": [83, 84], "rotaryembed": 95, "rotaryembeddingdim": [0, 1], "rotaryscalingtyp": 82, "rotate_every_two": 82, "rotate_half": 82, "round": [2, 70, 82], "rout": [2, 28], "router": [4, 10, 28, 93], "router_gemm": 26, "routin": 7, "routingkernel": 26, "row": [10, 79, 82, 90, 93], "rowlinear": [10, 83], "rowwis": 70, "rr": 93, "rslora": 93, "rst": 3, "rtx": 93, "rubric": 82, "rule": [5, 76, 92], "run": [0, 1, 2, 3, 5, 6, 9, 11, 12, 14, 15, 16, 21, 25, 26, 28, 29, 30, 31, 32, 47, 48, 52, 55, 56, 57, 64, 65, 66, 67, 69, 70, 71, 76, 77, 79, 80, 81, 82, 84, 87, 89, 90, 92, 93, 95, 96, 97, 98], "run_dtm_pld": 12, "run_medusa_decod": 54, "runner": [0, 15, 87], "runningleon": 93, "runpod": 31, "runtim": [0, 3, 5, 12, 13, 18, 26, 27, 29, 30, 49, 52, 58, 64, 70, 71, 72, 73, 75, 78, 79, 82, 83, 84, 88, 92, 93, 95, 97, 99], "runtime_config": [36, 49], "runtime_default": 84, "runtime_error": 1, "runtime_rank": 87, "runtimedefault": [0, 84], "runtimedefaultsin": 84, "runtimeerror": [69, 70, 92], "runtimetensor": 87, "s0": 5, "s1": 5, "s2": 5, "sacrif": 26, "sad": 87, "saeyoonoh": 93, "safe": [1, 7, 28, 80], "safer": 82, "safetensor": [15, 17, 92, 93], "sage_attn": 82, "sage_attn_k_block_s": 82, "sage_attn_k_quant_s": 82, "sage_attn_q_block_s": 82, "sage_attn_q_quant_s": 82, "sage_attn_v_block_s": 82, "sage_attn_v_quant_s": 82, "sageattent": 82, "sai": [72, 75, 79], "said": 77, "sake": 79, "sale": 73, "same": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 19, 22, 27, 28, 29, 52, 55, 56, 57, 65, 69, 70, 73, 74, 77, 80, 81, 82, 83, 85, 87, 89, 93], "sampl": [0, 1, 3, 5, 16, 18, 20, 26, 27, 43, 44, 46, 47, 48, 49, 50, 51, 52, 54, 58, 59, 64, 68, 70, 72, 73, 74, 82, 83, 87, 93], "sample_proj_bia": 83, "sample_weight_strip": 93, "samplemod": 82, "sampler": 70, "sampling_config": 87, "sampling_param": [36, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 59, 66, 67, 70, 75, 81, 88, 93, 94], "samplingconfig": [0, 3, 6, 36, 87, 93], "samplingparam": 
[36, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 59, 66, 67, 70, 75, 81, 88, 93, 94], "saniti": [66, 67, 76, 77, 80], "santacod": [69, 90, 91], "satfinit": 90, "satisfi": [6, 17, 93], "save": [5, 9, 12, 19, 20, 27, 28, 29, 31, 46, 49, 69, 70, 72, 73, 77, 80, 81, 89, 93], "save_checkpoint": [19, 84], "save_config": [19, 84], "saw": [77, 88], "sbatch": [16, 55, 56, 57], "sbsa": [93, 94], "scaffold": [93, 95], "scalar": [6, 11, 82], "scalartyp": 93, "scale": [0, 6, 10, 17, 28, 29, 70, 77, 82, 83, 90, 93], "scale_d0": 82, "scale_d1": 82, "scale_factor": 82, "scale_output": 82, "scale_qk": 83, "scale_typ": 82, "scalia": [42, 46, 47, 49, 50], "scaling_factor": 82, "scaling_long_factor": 82, "scaling_short_factor": 82, "scalingvecpoint": 1, "scanreducetempstorag": 1, "scanreducetempstoragebyt": 1, "scantempstorag": 1, "scantempstoragebyt": 1, "scatter": [7, 82], "scatter_nd": 82, "scenario": [2, 5, 11, 12, 15, 20, 23, 25, 26, 28, 29, 32, 73, 74, 75, 77, 79, 80, 93], "scfg": 87, "schedul": [0, 2, 3, 9, 10, 20, 27, 28, 29, 30, 51, 70, 73, 75, 80, 89, 93, 94], "schedule_request": 99, "scheduled_request": 99, "scheduler_config": [70, 81], "schedulerconfig": [0, 70, 81, 93], "schedulerpolici": 93, "schema": [0, 3, 45, 70, 73], "scheme": 0, "scicod": 26, "scienc": [42, 46, 47, 49, 50, 52], "scope": [18, 27, 93], "score": [6, 28], "scout": 91, "scratch": [73, 75, 76, 80], "script": [10, 14, 16, 19, 20, 31, 55, 56, 57, 65, 69, 72, 73, 74, 75, 85, 90, 92, 93, 94, 95], "sd3": 83, "sd35adalayernormzerox": 83, "sd3patchemb": 83, "sd3transformer2dmodel": 84, "sd3transformer2dmodelconfig": 84, "sdxl": 93, "seamless": 93, "search": [0, 1, 3, 6, 12, 18, 24, 29, 30, 36, 49, 64, 70, 77, 79, 82, 93, 96], "seashor": [34, 61], "seat": [42, 46, 47, 49, 50], "sec": [20, 22, 73, 74, 75, 77, 79, 80], "second": [1, 3, 6, 9, 10, 12, 20, 21, 23, 24, 26, 70, 79, 82], "secondari": [0, 8, 70, 89], "secondary_offload_min_prior": 70, "secondaryoffloadminprior": 0, "secondli": 79, "section": 
[3, 6, 16, 17, 19, 20, 27, 28, 30, 65, 69, 71, 73, 75, 77, 78, 79, 80, 82, 88, 91, 93, 97], "section_s": 82, "secur": [45, 93], "securityprotocol": 45, "see": [0, 1, 5, 6, 8, 12, 16, 17, 20, 21, 23, 24, 25, 27, 28, 30, 31, 32, 34, 40, 61, 67, 73, 74, 75, 77, 79, 80, 81, 82, 83, 84, 89, 90, 92, 93, 98], "seed": [0, 6, 30, 38, 39, 70, 86, 93], "seem": [9, 51, 58, 73, 76], "seen": [12, 20, 73], "segment": 93, "select": [0, 4, 6, 18, 25, 26, 28, 29, 73, 80, 82, 87, 89, 96, 99], "selectcontextid": 0, "selectgenidx": 0, "selective_scan": 82, "self": [0, 5, 7, 14, 16, 17, 52, 70, 73, 82, 84, 87, 92, 95, 98, 99], "self_attent": 17, "self_attention_mask": 83, "self_attention_packed_mask": 83, "self_attn": [17, 95], "selfidx": 0, "sell": 73, "semicolon": 65, "senat": [42, 46, 47, 49, 50], "send": [0, 2, 16, 26, 30, 75, 76, 82, 88, 93], "sens": 77, "sensit": [26, 77], "sent": [0, 12, 28, 30, 70], "sentenc": [0, 6, 70, 88], "separ": [11, 12, 29, 54, 65, 73, 82, 87, 97], "separate_match_rewrit": 7, "seq": [1, 5, 73, 82], "seq_idx": 87, "seq_len": [74, 82, 83, 97], "seq_length": 82, "seq_lens_cuda": 97, "seqlen": [0, 82], "seqslot": 1, "sequenc": [0, 1, 3, 5, 6, 7, 8, 9, 12, 16, 20, 21, 22, 23, 24, 26, 27, 28, 70, 71, 73, 74, 75, 78, 81, 82, 83, 87, 89, 93, 97, 98], "sequence_length": [82, 83, 87, 92], "sequence_length_buff": 87, "sequence_limit_length": 87, "sequenceindex": [0, 3], "sequencelengthscba": 1, "sequencelimitlength": 1, "sequenti": [0, 2, 12, 27, 89], "seri": 93, "serial": [29, 82, 84, 87], "serializ": 70, "serialize_engin": 87, "serializeds": 0, "serializedst": 0, "serv": [0, 2, 3, 5, 8, 12, 16, 18, 24, 25, 33, 34, 35, 37, 38, 39, 40, 41, 60, 61, 62, 64, 70, 80, 93, 96, 97], "server": [0, 9, 12, 16, 18, 22, 31, 33, 34, 35, 37, 38, 39, 60, 61, 62, 64, 93], "server_start_timeout": 30, "servic": [18, 59, 64], "session": [5, 69, 73, 87], "set": [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 12, 13, 15, 17, 18, 19, 20, 26, 28, 29, 30, 36, 45, 55, 56, 57, 65, 67, 70, 71, 72, 74, 75, 
77, 79, 80, 81, 82, 83, 84, 85, 87, 88, 89, 92, 93, 99], "set_attn_processor": 84, "set_from_opt": 1, "set_if_not_exist": 84, "set_input_shap": 87, "set_rank": 84, "set_rel_attn_t": 83, "set_shap": 87, "setadditionalmodeloutput": [0, 3], "setallottedtimem": 0, "setbackend": 0, "setbadword": 0, "setbatchingtyp": 0, "setbeamsearchdiversityr": 0, "setbeamwidth": 0, "setbeamwidtharrai": 0, "setbitto": 0, "setcachest": 0, "setcachetransceiverconfig": 0, "setclientid": 0, "setcommst": 0, "setcommunicationmod": 0, "setcommunicationtyp": 0, "setcontextfmha": 1, "setcontextphaseparam": [0, 2], "setcopyonpartialreus": 0, "setcrossattentionmask": 0, "setcrosskvcachefract": 0, "setcudagraphcaches": 0, "setcudagraphmod": 0, "setdatatyp": 1, "setdebugconfig": 0, "setdebuginputtensor": 0, "setdebugoutputtensor": 0, "setdebugtensornam": 0, "setdebugtensorsmaxiter": 0, "setdecodingconfig": 0, "setdecodingmod": 0, "setdeviceid": 0, "seteagleconfig": 0, "seteagleinput": 1, "setearlystop": 0, "setembeddingbia": 0, "setenableblockreus": 0, "setenablechunkedcontext": 0, "setenablecontextfmhafp32acc": 0, "setenablepartialreus": 0, "setenabletrtoverlap": 0, "setencodedvocab": 0, "setencoderhiddens": 1, "setencoderinputfeatur": 0, "setencoderinputtokenid": 0, "setencoderoutputlength": 0, "setendid": 0, "seteventbuffermaxs": 0, "setexecutionconfig": 1, "setexplicitdrafttokensinput": 1, "setextendedruntimeperfknobconfig": 0, "setexternaldrafttokensconfig": 0, "setfreegpumemoryfract": 0, "setfrequencypenalti": 0, "setfrom": 0, "setfrominput": 1, "setgathergenerationlogit": 0, "setgemmallreducedtyp": 1, "setgpuweightsperc": [0, 13], "setguideddecodingconfig": 0, "setguideddecodingparam": 0, "sethostcaches": 0, "setinittozero": 1, "setisorchestr": 0, "setiterstatsmaxiter": 0, "setkvcacheconfig": 0, "setkvcacheretentionconfig": 0, "setkvcachetyp": 1, "setlanguageadapteruid": 0, "setlayertyp": 1, "setlengthpenalti": 0, "setlevel": 1, "setlogitsdtyp": 1, "setlogitspostprocessor": 0, 
"setlogitspostprocessorconfig": 0, "setlogitspostprocessornam": 0, "setlookaheadconfig": 0, "setlookaheaddecodingconfig": 0, "setloraconfig": 0, "setloramodul": 1, "setmanagedweightsmap": 1, "setmanageweightstyp": 1, "setmaxattentionwindowvec": 0, "setmaxbatchs": [0, 1], "setmaxbeamwidth": [0, 1], "setmaxdraftpathlen": 1, "setmaxdrafttoken": 1, "setmaxencoderlen": 1, "setmaxinputlen": 1, "setmaxlorarank": 1, "setmaxnumpath": 1, "setmaxnumtoken": [0, 1], "setmaxpagesperblock": 1, "setmaxpositionembed": 1, "setmaxpromptembeddingtables": 1, "setmaxqueues": 0, "setmaxseqidlemicrosecond": 0, "setmaxsequencelen": 1, "setmaxtoken": 0, "setmedusachoic": 0, "setmem": 1, "setmemorytyp": 1, "setminp": 0, "setmintoken": 0, "setmlphiddens": 1, "setmodelnam": 1, "setmodelvari": 1, "setmropeconfig": 0, "setmultiblockmod": 0, "setmultimodalembed": 0, "setnbcrosskvhead": 1, "setnbkvhead": 1, "setnorepeatngrams": 0, "setnormalizelogprob": 0, "setnumcopystream": 1, "setnumdecodingenginetoken": 1, "setnumkvheadspercrosslay": 1, "setnumkvheadsperlay": 1, "setnumlanguag": 1, "setnumnod": 0, "setnumreturnsequ": 0, "setonboardblock": 0, "setorchestratorconfig": 0, "setorchleadercomm": 0, "setoutputconfig": 0, "setpadid": 0, "setpagedcontextfmha": 1, "setpagewidth": 1, "setparallelconfig": 0, "setparticipantid": 0, "setpath": 1, "setpeftcacheconfig": 0, "setpositionid": 0, "setppreducescatt": 1, "setpresencepenalti": 0, "setprior": 0, "setprocessorbatch": 0, "setprocessormap": 0, "setprompttableoffload": 0, "setprompttuningconfig": 0, "setquantmod": 1, "setrecvpollperiodm": 0, "setrepetitionpenalti": 0, "setrepl": [0, 3], "setrequeststatsmaxiter": 0, "setrequesttyp": [0, 2], "setreturnallgeneratedtoken": 0, "setrnnconfig": 1, "setrotaryembeddingdim": 1, "setsamplingconfig": 0, "setschedulerconfig": 0, "setse": 0, "setsecondaryoffloadminprior": 0, "setsinktokenlength": 0, "setsizeperhead": 1, "setskipcrossattnblock": [0, 1], "setslotsperpag": 1, "setspawnprocess": 0, "setspecdecconfig": 0, 
"setspeculativedecodingmod": 1, "setspeculativedecodingmodul": 1, "setstoptokenid": 0, "setstopword": 0, "setstream": 0, "settemperatur": 0, "setter": [0, 6], "settokenizerstr": 0, "settokensperblock": 1, "settopk": 0, "settopp": 0, "settoppdecai": 0, "settoppmin": 0, "settoppresetid": 0, "settotalnumpag": 1, "setup": [1, 5, 29, 45, 55, 56, 57, 67, 76, 77, 87, 88, 89, 93], "setup_fake_prompt": 87, "setup_fake_prompts_qwen2vl": 87, "setup_fake_prompts_vila": 87, "setup_input": 87, "setupeagl": 1, "setupexplicitdrafttoken": 1, "setuplookahead": 1, "setupspeculativedecod": 1, "setuptool": [66, 67], "setusecrossattent": 1, "setusegpudirectstorag": 0, "setusemrop": 1, "setusepositionembed": 1, "setuseshapeinfer": 1, "setusetokentypeembed": 1, "setworkerexecutablepath": 0, "setzero": [0, 1], "seve": 70, "sever": [0, 1, 2, 5, 7, 12, 15, 27, 36, 77, 78, 79, 80, 82, 89, 92, 97], "sft": 58, "sh": [16, 31, 93, 94], "shah": 93, "shaken": 51, "shall": [19, 89], "shape": [0, 1, 5, 7, 10, 15, 16, 26, 28, 70, 80, 82, 84, 87, 89, 90, 92, 93, 97, 98], "shape_cast_dtyp": 82, "shapeequ": 1, "shard": [17, 26, 64, 73, 78, 82, 83], "shard_map": 17, "sharding_along_vocab": 70, "sharding_dim": [82, 83], "share": [1, 2, 3, 5, 7, 8, 9, 10, 12, 19, 20, 25, 26, 27, 28, 29, 65, 76, 77, 82, 83, 93], "share_embed": 93, "share_weight": 83, "shared_embedding_t": 93, "shared_fc1": 28, "shared_fc2": 28, "shared_ptr": [0, 1], "sharedconstptr": 1, "sharedptr": 1, "shelf": 93, "sherlock113": 93, "shift": [11, 27], "ship": [19, 51], "shm": 92, "short": [5, 73, 77, 79], "short_mscal": [82, 83], "shorter": [5, 74], "shot": 93, "should": [0, 1, 2, 3, 7, 9, 10, 11, 19, 20, 28, 36, 42, 45, 46, 47, 49, 50, 52, 53, 55, 56, 57, 58, 65, 70, 73, 74, 75, 76, 80, 81, 82, 83, 85, 87, 89, 93, 95, 97, 98, 99], "should_stop": 87, "shouldus": 5, "show": [2, 3, 16, 22, 26, 27, 28, 30, 40, 74, 75, 79, 80, 88, 89, 91, 94], "showcas": [77, 80, 88], "shown": [11, 23, 27, 30, 65, 69, 73, 75, 77, 79, 80, 82], "shrunk": 82, 
"shuffl": 82, "shut": 2, "shutdown": [0, 59, 69, 70], "si": 5, "sibl": 16, "side": [3, 82], "side_stream_id": 82, "sidestreamidtyp": 82, "sigh": 58, "sigmoid": [16, 82], "signal": 0, "signatur": [7, 52, 82], "signifi": 79, "signific": [3, 5, 8, 23, 27, 28, 58, 76, 77, 79, 80], "significantli": [25, 26, 27, 28, 75, 76, 77, 79, 80, 89, 97], "silicon": 28, "silu": [16, 82, 83], "similar": [0, 5, 6, 7, 12, 20, 21, 23, 27, 36, 49, 53, 72, 73, 81, 82, 96, 99], "similarli": 12, "simpl": [2, 7, 8, 12, 16, 40, 52, 65, 69, 71, 74, 88, 94], "simpler": 12, "simpleschedul": 99, "simplest": 82, "simpli": [5, 12, 71, 73, 74, 79, 88, 92, 95], "simplic": 19, "simplifi": [5, 19, 73, 79, 82, 93], "simultan": [12, 79], "sin": [0, 82, 83], "sinc": [0, 1, 4, 5, 7, 9, 12, 13, 19, 20, 27, 28, 31, 36, 65, 70, 73, 75, 76, 77, 79, 80, 82, 84, 89, 96, 98, 99], "sincer": 28, "sinco": 83, "singl": [0, 1, 2, 3, 4, 5, 6, 8, 12, 14, 16, 19, 20, 23, 24, 26, 27, 28, 29, 34, 52, 61, 69, 70, 72, 73, 77, 80, 82, 84, 88, 89, 90, 93, 95, 96, 97, 98], "singleton": [7, 82], "sink": [0, 1, 5, 70, 87], "sink_token_len": 87, "sink_token_length": [5, 70, 87], "sinktokenlength": [0, 1], "sinusoid": 83, "sit": [19, 58], "situaiton": 74, "situat": [12, 58, 64, 75, 79], "six": 27, "size": [0, 1, 2, 5, 6, 8, 9, 10, 11, 12, 13, 20, 22, 23, 25, 26, 27, 28, 29, 30, 36, 52, 55, 56, 57, 64, 70, 72, 73, 74, 75, 76, 77, 78, 80, 82, 83, 84, 87, 92, 93, 97, 99], "size_t": [0, 1], "size_typ": [0, 1], "sizeof": 1, "sizeperhead": [0, 1], "sizetype32": [0, 1], "sizetype64": [0, 1], "skip": [0, 1, 7, 17, 20, 32, 59, 65, 70, 82, 99], "skip_attn": [82, 83], "skip_cross_attn_block": [84, 87], "skip_cross_kv": [83, 87], "skip_encod": 87, "skip_special_token": [70, 93], "skip_tokenizer_init": [36, 70], "skipcrossattnblock": [0, 1], "sku": [75, 77, 79, 80], "skywork": [90, 91, 93], "sleep": 32, "slice": [1, 4, 17, 82, 93], "slice_shap": 17, "sliceinputtyp": 82, "slicen": 1, "slide": [8, 64, 81, 82, 87, 93], "slider": [20, 26, 73], 
"sliding_window": 84, "sliding_window_caus": 82, "sliding_window_pattern": 84, "slight": [20, 27, 28, 77, 79, 80], "slightli": [0, 2, 10, 11, 30, 77, 80], "slope": [5, 82], "slot": [0, 1, 93], "slot_map": [82, 84], "slotidx": 1, "slotsperpag": 1, "slow": [3, 9, 70, 71, 76], "slower": [8, 19, 28, 76], "slowest": 5, "slurm": [16, 55, 56, 57, 67, 69, 92, 93], "sm": [91, 93], "sm120": 93, "sm80": [91, 93], "sm86": [91, 93], "sm89": [91, 93], "sm90": [91, 93], "small": [5, 9, 11, 12, 16, 25, 26, 27, 28, 75, 77, 79, 80, 82, 89, 92, 93], "smaller": [1, 12, 20, 27, 29, 72, 73, 76, 79, 80, 81, 82, 89, 93], "smallest": [0, 1, 8, 82], "smart": 82, "smaug": [91, 93], "smi": [20, 26, 73, 89], "smile": 58, "smith": [42, 46, 47, 48, 49, 50, 52, 59], "smooth": [19, 70, 93], "smoother": 20, "smoothquant": [7, 25, 64, 93], "smoothquant_v": 70, "snapshot": 73, "snapshot_download": 58, "snip": 73, "snippet": [73, 93, 99], "snshrivas10": 58, "so": [0, 2, 3, 5, 7, 10, 12, 18, 19, 20, 26, 27, 28, 31, 36, 49, 65, 70, 73, 76, 77, 79, 80, 81, 82, 83, 84, 89, 91, 93, 95, 98], "socketst": 0, "softmax": [5, 16, 27, 28, 82, 97], "softplu": 82, "softwar": [3, 5, 16, 28, 64, 71, 93], "solid": 78, "solut": [18, 69, 92, 96], "some": [0, 2, 3, 4, 5, 6, 7, 9, 12, 13, 15, 16, 19, 20, 26, 27, 28, 29, 30, 32, 58, 67, 70, 71, 74, 77, 78, 80, 81, 82, 85, 88, 89, 92, 93, 95, 96, 99], "someth": [16, 36, 51], "sometim": 73, "song": 73, "soon": [0, 21, 22, 23, 24, 25, 27, 36], "sophist": 52, "sora": [34, 61], "sort": [0, 1, 3, 6, 82], "sota": 93, "sourc": [14, 15, 17, 19, 20, 21, 24, 26, 28, 29, 30, 33, 34, 35, 37, 38, 39, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 64, 70, 71, 82, 83, 84, 85, 86, 87, 93], "source_root": [55, 56, 57], "sourcetaskvalu": 1, "soyer": [14, 16, 92], "space": [10, 65, 70, 79, 89, 98], "spaces_between_special_token": [70, 93], "span": [19, 26, 27], "spars": [12, 28, 82, 93], "sparse_fc1": 28, "sparse_fc2": 28, "sparsiti": 29, 
"spatial_norm_dim": 83, "spawn": [40, 50, 66, 67, 69, 75, 88, 92], "spawnprocess": [0, 2], "spec": 29, "spec_decode_algo": 27, "spec_decode_nextn": 27, "spec_decoding_generation_length": [82, 83, 84], "spec_decoding_is_generation_length_vari": [82, 83, 84], "spec_decoding_max_generation_length": [82, 83], "spec_decoding_packed_mask": [82, 83, 84], "spec_decoding_param": [83, 84], "spec_decoding_position_offset": [82, 83, 84], "spec_decoding_us": [82, 83], "specdec": 0, "specdecconfig": 0, "specdecfastlogitsinfo": 0, "specdecodinggenerationlength": 1, "specdecodinggenerationlengthshost": 1, "specdecodingpackedmask": 1, "specdecodingparam": 83, "specdecodingpositionoffset": 1, "specdecodingstat": 0, "specdecstat": 0, "special": [2, 5, 10, 16, 17, 21, 27, 29, 70, 93], "specif": [0, 1, 4, 6, 7, 8, 10, 11, 12, 15, 19, 22, 25, 26, 28, 30, 52, 65, 67, 73, 76, 77, 80, 82, 88, 93, 95, 96], "specifi": [0, 1, 2, 3, 5, 6, 7, 8, 10, 12, 17, 19, 20, 29, 30, 36, 43, 44, 45, 52, 54, 58, 59, 65, 69, 70, 72, 73, 74, 76, 77, 79, 81, 82, 84, 85, 87, 88, 89, 92, 93, 97], "specul": [0, 1, 3, 26, 64, 68, 70, 73, 75, 82, 93], "speculative_config": [20, 26, 27, 43, 44, 53, 54, 70], "speculative_decod": 93, "speculative_decoding_draft_tokens_extern": 84, "speculative_decoding_mod": [29, 70, 73], "speculative_model": [43, 44, 54, 70], "speculativedecod": 0, "speculativedecodingconfig": 0, "speculativedecodingfastlogitsinfo": 0, "speculativedecodingmetr": 0, "speculativedecodingmod": [70, 84, 93], "speculativedecodingmodul": 93, "speculativedecodingoutput": 1, "speed": [16, 22, 26, 27, 28, 29, 73, 74, 80, 93], "speedup": [22, 24, 25, 26, 28], "spent": 0, "split": [1, 4, 5, 10, 16, 70, 73, 76, 77, 82, 89, 93], "split_input_id": 87, "split_prompt_by_imag": 87, "split_siz": 82, "split_size_or_sect": 82, "splittransposecpu": 1, "splittransposecpuinn": 1, "splitwis": 2, "spot": 79, "sq": [25, 90, 93], "sqrt": [5, 82], "squar": [79, 82], "squared_relu": 82, "squeez": [1, 82, 87], "src": [1, 16, 
82], "src_seq_len": 82, "srcdesc": 0, "srctype": 1, "srun": [16, 30, 55, 56, 57, 67, 92], "sshd": 31, "ssid": 45, "ssm": 82, "ssm_state": 84, "stabil": 26, "stabl": [5, 17, 29, 75, 79, 80, 82, 93], "stack": [17, 26, 65, 82], "stage": [0, 5, 7, 12, 27, 74, 89, 93, 97], "stai": [22, 25, 76, 80], "stand": 16, "standalon": 19, "standard": [12, 16, 18, 21, 74, 82], "starcod": [69, 91, 93], "starcoder1": 90, "starcoder2": [90, 93], "starrickliu": 93, "start": [0, 3, 5, 7, 9, 20, 27, 29, 31, 32, 33, 34, 35, 37, 38, 39, 57, 58, 60, 61, 62, 65, 69, 70, 71, 73, 74, 75, 76, 79, 81, 82, 84, 86, 87, 89, 93], "start_dim": 82, "startup": 92, "stat": [0, 70, 93], "state": [0, 1, 3, 4, 5, 7, 8, 9, 12, 20, 26, 27, 29, 40, 42, 43, 44, 46, 47, 48, 49, 50, 52, 54, 59, 66, 67, 70, 73, 74, 75, 79, 81, 82, 88, 93, 94, 99], "state_dtyp": 87, "state_or_ptr": 82, "state_s": 87, "statement": 69, "stateptr": 0, "states": 1, "static": [0, 1, 3, 12, 28, 29, 70, 82, 83, 84, 87, 93], "static_batch": [70, 81], "static_cast": 90, "staticbatchingstat": 0, "statist": [0, 3, 12, 30, 70, 73, 93], "statu": 92, "std": [0, 1, 3], "stddev": [30, 38, 39], "stdev": [20, 56, 72, 73, 74, 75], "stdit": 93, "stdout": [20, 56, 72, 73, 74, 75], "steadi": 74, "steady_clock": 0, "step": [0, 1, 5, 6, 7, 9, 12, 15, 16, 18, 19, 21, 26, 27, 32, 52, 64, 66, 67, 70, 71, 73, 74, 75, 82, 87, 92, 96, 97, 98, 99], "still": [5, 17, 19, 20, 26, 27, 28, 71, 73, 75, 77, 82, 87, 89, 93], "stop": [0, 1, 3, 6, 7, 12, 70, 73, 79, 87, 88, 93], "stop_reason": [53, 70, 88, 93], "stop_token_id": [3, 70], "stop_words_data": 87, "stop_words_list": 87, "stopping_criteria": 87, "stoppingcriteria": [87, 93], "stoppingcriterialist": 87, "stoptokenid": [0, 3], "stopword": 0, "stopwordslen": 1, "stopwordslist": 1, "stopwordsptr": 1, "storag": [0, 8, 10, 70], "store": [0, 1, 5, 8, 9, 10, 16, 22, 26, 51, 54, 69, 70, 73, 81, 82, 84, 89, 90, 95, 97, 98], "store_tru": 54, "stored_block": 51, "stori": 58, "str": [15, 19, 47, 48, 70, 82, 83, 84, 87], 
"straightforward": 27, "strategi": [0, 11, 12, 25, 27, 36, 49, 64, 70, 73, 78, 82, 84, 89, 93], "stream": [0, 1, 2, 3, 16, 28, 29, 30, 36, 38, 39, 40, 41, 52, 70, 72, 82, 87, 89, 92, 93], "stream_ptr": 52, "streaming_llm": 93, "streamingllm": [29, 64, 93], "streamlin": [73, 88], "streamptr": [0, 1, 3], "street": 58, "strenum": [70, 86], "strict": [26, 27], "strict_bound": 82, "strict_dtyp": [82, 83], "stricter": 26, "strictli": 73, "stride": [1, 82, 83], "strike": [12, 51], "string": [0, 1, 3, 15, 45, 70, 73, 82, 87], "string_valu": 9, "string_view": 1, "stringptrmap": 1, "stringvec": 0, "strip": [29, 93], "strip_plan": 29, "strongli": 77, "strongly_typ": [70, 93], "struct": [0, 1, 8], "structur": [0, 4, 7, 8, 12, 28, 52, 70, 82, 89, 93], "structural_tag": 70, "struggl": 58, "student": [42, 46, 47, 49, 50, 52], "studi": [28, 75, 77, 78, 80], "style": [5, 12, 26, 93], "sub": [15, 19, 82], "subclass": [1, 19, 52, 95], "subcommad": 73, "subcommand": [74, 93], "subgraph": [7, 82], "subject": [2, 21, 23, 24, 25, 69, 82, 88, 94], "submiss": 73, "submit": [10, 70, 73], "submit_sync": 70, "submittransferrequest": 0, "submodul": [20, 65, 95], "suboptim": 16, "subscript": 82, "subsequ": [2, 9, 10, 12, 27, 75], "subset": [0, 3, 6, 16, 19, 27, 73, 82], "substanti": [9, 12, 26, 28], "subsystem": 93, "subtract": 7, "succe": [89, 93], "succeed": 87, "success": [3, 22, 26, 74], "successfulli": [12, 32, 77], "sudo": [20, 26, 66, 67, 73], "suffer": 26, "suffici": [76, 77], "suggest": [5, 25, 58, 77], "suit": [5, 73, 74], "sum": [1, 7, 14, 82, 98], "sum_of_token": 82, "summar": [5, 12, 13, 14, 15, 23, 25, 73, 74, 81, 89], "summari": [8, 12, 64], "summat": 82, "sunjiabin17": 93, "super": [7, 14, 17, 19, 91, 92, 95, 99], "superchip": 91, "supplementari": 83, "suppli": [10, 18], "support": [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 15, 18, 19, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 36, 45, 52, 55, 56, 57, 58, 64, 67, 68, 70, 74, 75, 77, 79, 80, 81, 82, 83, 85, 88, 92, 93, 94, 95, 96, 
97, 98, 99], "supportsinflightbatch": 1, "suppos": 95, "suprem": [42, 46, 47, 49, 50], "sure": [2, 19, 20, 27, 32, 65, 73, 81, 82, 93], "surpass": 5, "surround": [5, 93], "swa": 8, "swap": 8, "sweep": [16, 22, 79], "sweet": 79, "swept": 23, "swiglu": [29, 82, 93], "switch": [4, 9, 11, 22, 25, 26, 28, 65, 81, 89, 93], "sxm": [22, 29, 75, 77, 78], "sy": 93, "symbol": 0, "sync": 87, "synchron": [1, 3, 16, 70, 92, 93], "syncmessag": 0, "syntax": [82, 88], "synthet": [20, 30, 38, 39, 73, 74], "synthetic_128_128": 73, "synthetic_2048_2048": 75, "synthetic_2048_2048_1000": 75, "system": [8, 9, 16, 20, 22, 27, 28, 30, 33, 34, 45, 55, 56, 57, 60, 61, 64, 65, 67, 74, 76, 88, 91, 93, 94], "systemat": 26, "t": [0, 1, 5, 12, 16, 19, 26, 28, 30, 31, 36, 51, 55, 56, 57, 67, 70, 72, 73, 76, 79, 80, 82, 84, 87, 92], "t5": [5, 6, 90, 91, 93], "t_": 27, "t_2": 27, "t_5": 27, "tabl": [0, 6, 9, 22, 25, 29, 73, 74, 82, 83, 87, 91, 92, 93], "tackl": 28, "tactic": [28, 29], "tag": [0, 31, 65, 70], "tailor": [25, 77, 80], "take": [0, 1, 2, 5, 6, 7, 9, 11, 15, 19, 27, 51, 58, 71, 73, 75, 76, 79, 82, 83, 98], "taken": [17, 21, 22, 82], "talk": 58, "tanh": [82, 83], "target": [0, 17, 20, 28, 29, 36, 64, 65, 73, 80, 81, 93], "target_isl": 73, "target_osl": 73, "targetcach": 1, "targetpageid": 1, "targetprob": 1, "targettaskvalu": 1, "tarot": 58, "task": [0, 1, 9, 10, 12, 14, 15, 47, 48, 55, 56, 57, 70, 73, 83, 87, 90, 93, 98], "task_id": [10, 73], "task_vocab_s": 83, "taskid": [0, 1], "taskidtyp": 1, "tasklayermoduleconfig": 1, "tasklayermoduleconfigbind": 1, "tasklayermoduleconfiglistptr": 1, "taskshost": 1, "taskvalu": 1, "taskvalueptr": 1, "taslid": 1, "tayef": 93, "tconstptr": 1, "tcp": 32, "team": [15, 19, 20, 26, 27, 28, 32, 91, 93], "tech": [27, 93], "technic": [8, 27, 28, 64], "techniqu": [5, 7, 12, 16, 21, 26, 27, 28, 71, 76, 77, 78, 81, 90, 93], "technologi": [26, 42, 46, 47, 49, 50, 52], "tekit_2025": 73, "tell": [34, 58, 59, 61, 80, 88], "temb": 83, "temp": 87, "temperatur": [0, 1, 
6, 30, 33, 34, 35, 36, 40, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 54, 59, 66, 67, 70, 73, 75, 81, 87, 88, 93], "tempfil": [46, 49], "templat": [0, 1, 16, 17], "tempor": 87, "temporari": 2, "ten": [12, 25, 27], "tend": 81, "tensor": [1, 6, 11, 15, 16, 17, 20, 21, 22, 23, 24, 26, 27, 28, 30, 50, 52, 64, 70, 73, 74, 77, 78, 80, 82, 83, 84, 87, 90, 92, 93, 95, 97], "tensor_dict": 87, "tensor_input": 7, "tensor_parallel_s": [50, 51, 54, 55, 56, 57, 70, 75, 76, 77, 80, 81], "tensor_shap": 17, "tensorconstptr": 1, "tensorinfo": 87, "tensorloc": 82, "tensormap": 1, "tensorparallel": [0, 1, 6], "tensorptr": [0, 1], "tensorrt": [1, 3, 5, 6, 7, 8, 11, 13, 14, 21, 24, 26, 28, 29, 30, 33, 34, 35, 36, 37, 38, 39, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 66, 67, 68, 72, 74, 77, 78, 80, 81, 82, 87, 90, 92, 94, 95, 96, 97, 98, 99], "tensorrt_llm": [0, 1, 2, 3, 5, 6, 7, 10, 13, 14, 16, 17, 19, 20, 30, 31, 32, 36, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 65, 66, 67, 70, 73, 74, 75, 77, 80, 81, 82, 83, 84, 85, 86, 87, 88, 92, 93, 94, 95, 96, 97, 98], "tensorrt_llm_gpt": 16, "tensorrt_llm_rouge1_threshold": 15, "tensorrtllm_backend": [10, 88, 93], "term": [16, 69, 81, 82, 88], "termin": [0, 9, 32, 74, 93], "test": [5, 25, 26, 27, 30, 34, 61, 64, 65, 66, 67, 73, 74, 75, 77, 78, 79, 80, 81, 91, 93, 98], "test_graph_rewrit": 7, "test_trt_llm": [13, 14, 15], "texec": 0, "text": [0, 3, 5, 6, 9, 29, 34, 36, 40, 41, 42, 50, 51, 59, 61, 66, 67, 70, 71, 73, 74, 75, 81, 87, 88, 91, 92, 93, 94], "text_diff": 70, "text_hidden_s": 84, "textattack": 91, "textprompt": 70, "tg_group": 82, "tgt": [16, 82], "tgt_len": [82, 83], "tgt_seq_len": 82, "th": [1, 15, 27, 82], "than": [0, 1, 2, 3, 5, 6, 7, 9, 12, 16, 20, 21, 22, 23, 25, 26, 27, 28, 29, 65, 70, 71, 73, 74, 75, 76, 77, 79, 81, 82, 87, 89, 92, 93, 97], "thank": [27, 93], "thecodewrangl": 93, "thei": [0, 1, 3, 5, 6, 10, 16, 17, 19, 26, 27, 28, 53, 65, 70, 73, 
75, 77, 79, 80, 81, 82, 84, 90, 93], "them": [0, 3, 4, 7, 12, 13, 20, 26, 27, 28, 55, 56, 57, 70, 71, 72, 73, 76, 78, 79, 81, 82, 87, 89, 95], "theoret": 89, "theori": 81, "therebi": [2, 81], "therefor": [13, 19, 74, 82, 92, 98], "thermal": 73, "theta": 82, "thi": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 36, 40, 45, 52, 54, 55, 56, 57, 58, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 87, 88, 89, 90, 92, 93, 94, 95, 96, 97, 98, 99], "thin": 19, "thing": [6, 32, 42, 46, 47, 49, 50, 52, 79, 80], "think": [26, 27, 28, 51, 78], "third": [3, 93], "those": [3, 5, 6, 15, 16, 18, 20, 26, 27, 28, 29, 30, 72, 74, 75, 80, 82, 83, 90], "though": [19, 27, 79, 89], "thread": [0, 1, 5, 11, 36, 69, 73, 87], "three": [2, 3, 15, 25, 26, 28, 81, 82, 90, 95, 96, 97], "threshold": [0, 26, 27, 82, 87], "throttl": 73, "through": [0, 5, 6, 7, 11, 12, 16, 17, 18, 20, 26, 29, 30, 65, 71, 73, 75, 76, 77, 79, 80, 83, 88, 93], "throughout": [75, 78], "throughput": [0, 3, 5, 21, 22, 23, 27, 56, 64, 72, 77, 79, 80, 81, 93, 97], "throw": [0, 1], "thu": [9, 19, 20, 26, 28, 65, 82, 89], "thumb": [5, 76, 92], "ti": [5, 27], "tiiuae": 73, "tile": 28, "time": [0, 1, 2, 3, 5, 9, 10, 11, 12, 13, 16, 20, 23, 25, 26, 27, 28, 29, 42, 46, 47, 48, 49, 50, 58, 64, 65, 70, 71, 72, 73, 74, 75, 77, 78, 79, 81, 82, 87, 92, 93, 98], "time_embed_dim": 83, "time_encod": 87, "time_point": 0, "timedelta": 70, "timedout": 0, "timelin": 15, "timeout": [0, 30, 36, 70, 93], "timepoint": 0, "timestamp": 0, "timestep": [83, 84], "timestepembed": 83, "timingmetr": 0, "tini": 58, "tinyllama": [30, 33, 35, 38, 40, 42, 45, 46, 47, 48, 49, 50, 51, 52, 53, 58, 59, 60, 62, 66, 67, 69, 88, 94], "tip": 64, "titl": 45, "tle": 13, "tllm_checkpoint_16gpu_tp8_pp2": 76, "tllm_ckpt_dir": 14, "tllm_engine_dir": 14, "tllm_kei": [17, 83], "tllm_llmapi_build_cach": 93, "tllm_llmapi_enable_nvtx": 72, "tllm_log_level": 92, 
"tllm_nvtx_debug": 72, "tllm_override_layer_num": 93, "tllm_profile_record_gc": 72, "tllm_profile_start_stop": 72, "tllm_to_externel_key_dict": 17, "tllm_torch_profile_trac": 72, "tllm_trace_model_forward": 93, "tllm_weight": 17, "tllmruntim": [1, 6, 92], "tlntin": 93, "tmp": [10, 13, 56, 72, 73, 76], "tmp9so41y3r": 73, "tmpowsrb_f4": 73, "tmpxhdvasex": 73, "to_arrai": 82, "to_dict": [70, 84], "to_json_fil": 84, "to_layer_quant_config": 84, "to_legacy_set": 85, "to_str": [0, 1, 3], "to_trt": 84, "tobyt": 1, "todo": [1, 54, 82], "togeth": [3, 5, 6, 10, 16, 18, 21, 26, 27, 29, 87, 90, 93], "toggl": 72, "toi": 79, "toitensor": 0, "tojsonstr": 0, "tok": [21, 23, 24, 80], "token": [0, 1, 2, 3, 4, 5, 6, 8, 9, 12, 16, 20, 21, 24, 25, 26, 27, 28, 29, 30, 31, 38, 39, 45, 51, 52, 56, 64, 70, 72, 73, 74, 75, 77, 78, 80, 82, 83, 84, 87, 88, 89, 90, 93, 95, 96, 97], "token_drop": 83, "token_end": 70, "token_extra_id": 51, "token_id": [36, 51, 52, 53, 70], "token_ids_diff": 70, "token_range_retention_config": 70, "token_start": 70, "token_type_id": [84, 87], "tokenend": 0, "tokenextraid": 1, "tokenextraidtyp": 1, "tokenid": 1, "tokenidtyp": [0, 1], "tokenization_utils_bas": 70, "tokenizer_dir": [14, 16, 88, 92], "tokenizer_image_token": 87, "tokenizer_max_seq_length": [70, 77, 84, 86], "tokenizer_mod": 70, "tokenizer_revis": 70, "tokenizer_str": [0, 3], "tokenizerbas": 70, "tokenizerstr": [0, 3], "tokenlogprob": 70, "tokenrangeretentionconfig": [0, 70], "tokenrangeretentionprior": 0, "tokens_per_block": [8, 9, 29, 87, 93, 98], "tokensperblock": [0, 1, 6], "tokensperstep": 1, "tokensprompt": 70, "tokenstart": 0, "tokyo": [34, 61], "toler": 25, "tomodulenam": 1, "tomoduletyp": 1, "tonylek": 93, "too": [3, 5, 20, 28, 75, 79, 92], "took": 75, "tool": [2, 15, 20, 28, 64, 69, 73, 93], "tool_cal": 88, "toolkit": [18, 19, 25, 26, 67, 96], "top": [0, 5, 6, 12, 16, 18, 27, 28, 70, 82, 93], "top1": 26, "top_k": [6, 70, 87, 93], "top_p": [6, 40, 42, 43, 44, 46, 47, 48, 49, 50, 51, 52, 54, 
59, 66, 67, 70, 75, 81, 87, 88], "top_p_decai": [70, 87], "top_p_min": [70, 87], "top_p_reset_id": [70, 87], "topenkoff": 93, "topic": 80, "topk": [0, 1, 4, 6, 12, 26, 28, 82, 93], "topk_logit": 3, "topklastdim": 82, "topklogit": 3, "topkmedusahead": 1, "topktopp": [0, 6], "topmodelmixin": [19, 84], "topn": 26, "topp": [0, 1, 6, 93], "toppdecai": [0, 1, 6], "toppmin": [0, 1, 6, 70], "toppresetid": [0, 1, 6], "torch": [5, 17, 52, 59, 65, 66, 67, 70, 73, 82, 87, 92, 95], "torch_compile_en": 70, "torch_compile_enable_userbuff": 70, "torch_compile_fullgraph": 70, "torch_compile_inductor_en": 70, "torch_compile_piecewise_cuda_graph": 70, "torchaudio": [66, 67], "torchllmarg": 70, "torchvis": [66, 67], "tostr": [0, 1], "total": [0, 1, 4, 5, 6, 12, 15, 17, 20, 27, 29, 30, 73, 74, 75, 76, 89, 98], "total_lat": [21, 24], "total_token": 88, "totalaccepteddrafttoken": 0, "totaldrafttoken": 0, "totalgentoken": 1, "totalnumpag": 1, "totensor": 0, "touch": [31, 95], "tp": [0, 2, 4, 6, 10, 16, 20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 56, 73, 74, 75, 82, 93], "tp1": [21, 22, 23], "tp2": 73, "tp4": 26, "tp4ep2": 26, "tp8": [23, 26, 28], "tp8ep2": 26, "tp_1_pp_1": 73, "tp_dim": [17, 83], "tp_group": [82, 83], "tp_rank": [17, 82, 83], "tp_size": [4, 10, 15, 16, 17, 19, 30, 37, 55, 57, 73, 74, 76, 82, 83, 86, 93], "tp_split_dim": 83, "tpot": [24, 74], "tprank": 1, "tpsize": 1, "tqdm": [17, 70, 93], "trace": [19, 29, 30, 72, 92], "track": [5, 8, 70, 82], "trade": [9, 28], "tradeoff": [25, 26, 27, 77], "tradit": 0, "train": [12, 14, 15, 16, 18, 19, 22, 25, 27, 73, 82, 92, 95], "trait": 93, "transa": 82, "transb": 82, "transceiv": [0, 70], "transfer": [0, 2, 16, 28, 52, 70, 93], "transfer_mod": 70, "transferdesc": 0, "transfermod": 0, "transferop": 0, "transferrequest": 0, "transferstatu": 0, "transform": [0, 4, 5, 12, 14, 15, 16, 17, 29, 30, 36, 70, 84, 88, 89, 91, 92, 93, 95, 96, 98], "translat": [81, 93], "transmiss": [2, 11], "transmit": [2, 11], "transpos": [1, 15, 82], "transposit": 
82, "travers": 16, "treat": [5, 26, 82], "tree": [0, 73, 87, 92, 98], "tri": [28, 99], "tricki": 84, "trigger": [5, 7, 16, 29, 36, 59, 69], "trim": 1, "trimpool": 1, "triton": [9, 10, 12, 16, 18, 64, 71, 93], "tritonserv": 93, "trivial": 16, "troubleshoot": [64, 93], "trt": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 22, 31, 46, 49, 73, 79, 82, 84, 86, 87, 89, 92, 93, 97], "trt_ckpt": [10, 13, 15, 92], "trt_engin": [10, 13, 15, 92], "trt_root": 20, "trt_tensor": [16, 82], "trtdatatyp": 1, "trtgptmodel": 89, "trtgptmodeloptionalparam": 93, "trtgptmodelv1": 93, "trtllm": [9, 10, 13, 14, 15, 16, 19, 20, 27, 33, 34, 35, 36, 37, 38, 39, 40, 41, 55, 60, 61, 62, 64, 69, 70, 73, 74, 77, 78, 79, 80, 89, 92, 93], "trtllm_dg_jit_use_nvcc": 20, "trtllm_disable_kv_cache_transfer_overlap": 2, "trtllm_disable_unified_convert": 17, "trtllm_enable_kvcache_receive_parallel": 2, "trtllm_enable_mmha_multi_block_debug": 73, "trtllm_enable_pdl": [20, 26, 27, 73], "trtllm_force_xqa": 5, "trtllm_kvcache_send_max_concurrency_num": 2, "trtllm_kvcache_transfer_buffer_s": 2, "trtllm_kvcache_transfer_use_async_buff": 2, "trtllm_mmha_blocks_per_sequ": 73, "trtllm_mmha_kernel_block_s": 73, "trtllm_model": 17, "trtllm_modules_to_hf_modul": 87, "trtllm_parallel_cache_send": 2, "trtllm_pdl_overlap_ratio": 73, "trtllm_precompiled_loc": 65, "trtllm_prefetch_ratio": 73, "trtllm_request_kv_cache_concurr": 2, "trtllm_serv": 30, "trtllm_try_zcopy_for_kvcache_transf": 2, "trtllm_use_mpi_kvcach": 2, "trtllm_use_precompil": 65, "trtllm_use_ucx_kvcach": 2, "trtllmarg": 70, "trtllmattent": 97, "trtlmmdatatyp": 0, "true": [0, 1, 3, 6, 7, 9, 12, 15, 20, 26, 27, 28, 30, 36, 42, 43, 44, 48, 49, 51, 52, 53, 54, 56, 58, 70, 72, 73, 74, 77, 80, 82, 83, 84, 85, 87, 89, 92, 93], "true_output_valu": 82, "true_valu": 82, "truncat": [70, 93], "truncate_prompt_token": [70, 93], "trust": [28, 70], "trust_remote_cod": [30, 70, 93], "try": [0, 1, 3, 14, 19, 53, 58, 69, 74, 77, 79, 80, 81, 88, 89, 92, 94], "tsuji": 73, 
"ttensor": 1, "ttft": [74, 77, 79, 80, 81, 93], "ttim": 93, "ttl": 26, "tunabl": 78, "tune": [0, 2, 3, 12, 22, 25, 26, 28, 29, 64, 70, 73, 74, 77, 80, 83, 84, 87, 88, 89, 93], "tuner": 0, "tupl": [0, 1, 82, 83, 87, 99], "turn": [5, 6, 9, 12, 28, 65, 77, 87, 89, 93], "tushar": 93, "tweak": 81, "twice": 16, "two": [0, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 15, 16, 19, 22, 26, 27, 28, 29, 30, 34, 61, 65, 69, 73, 75, 77, 79, 81, 82, 83, 85, 93, 96, 98, 99], "twofold": 12, "twoshot": 82, "txt": [19, 20, 56, 67, 72, 73, 75, 88, 93], "type": [1, 2, 3, 5, 6, 7, 10, 15, 16, 22, 25, 28, 29, 30, 33, 34, 35, 38, 39, 45, 51, 52, 54, 61, 70, 73, 77, 80, 82, 84, 86, 87, 88, 90, 91, 92, 93, 95, 96, 97, 98], "typedef": [0, 1], "typenam": [0, 1, 16], "typetrait": 0, "typic": [0, 2, 7, 14, 16, 19, 25, 27, 28, 30, 67, 69, 76, 77, 80, 81, 85, 87, 89, 93, 95], "typo": 93, "u": [1, 7, 28, 31, 42, 46, 47, 48, 49, 50, 59, 73, 74, 93], "ub": 82, "ub_oneshot": 73, "ub_tp_siz": 73, "ubuntu": [66, 67, 93, 94], "uc_handl": 1, "uc_ptr": 1, "uc_va": 1, "ucx": [2, 93], "ucx_cuda_copy_async_mem_typ": 2, "ucx_cuda_copy_dmabuf": 2, "ucx_info": 2, "ucx_memtype_cach": 2, "ucx_rndv_frag_mem_typ": 2, "ucx_rndv_pipeline_error_handl": 2, "uid": [0, 87], "uint16_t": 0, "uint32": 1, "uint32_t": [0, 1, 82], "uint64": [1, 9], "uint64_t": [0, 1], "uint8": 1, "uint8_t": [0, 1], "uintptr_t": [0, 1], "uk": 28, "uk_bgemm": 26, "ulimit": [65, 92], "ultim": 76, "ulyss": 93, "unabl": [67, 79], "unaccept": 77, "unari": 82, "unaryoper": 82, "unbind": 82, "uncas": 91, "uncertainti": 12, "unchang": [12, 80, 82], "uncommon": 16, "undefin": 82, "under": [0, 25, 29, 65, 69, 73, 74, 92, 93], "underli": [0, 1, 7, 12], "underlying_type_t": 1, "underlyingtyp": [0, 1], "underscor": 77, "understand": [64, 65, 72], "understood": [70, 79], "underutil": 12, "uneven": 93, "unevenli": 26, "unexpect": [92, 93], "unfinish": 0, "unfus": 82, "unfuse_qkv_project": 84, "ungath": 1, "unguid": 45, "unif": 93, "unifi": [15, 19, 25, 93], "uniform": 
[73, 74, 82], "uniniti": 97, "uninstal": 67, "union": [70, 82], "uniqu": [0, 5, 6, 8, 10, 12, 15, 29, 70, 73], "unique_ptr": [0, 1], "unique_token": 51, "uniqueconstptr": 1, "uniqueptr": 1, "uniquetoken": 1, "unit": [1, 8, 17, 28, 40, 42, 43, 44, 46, 47, 48, 49, 50, 52, 54, 59, 64, 65, 66, 67, 73, 75, 81, 88, 94], "univers": [42, 46, 47, 49, 50, 52], "unless": [0, 36, 70, 76, 80, 81], "unlik": [9, 12], "unlock": 71, "unnecessari": [7, 93, 95, 99], "unneed": [5, 26], "unordered_map": [0, 1, 3], "unpatchifi": 84, "unschedul": 79, "unset": 81, "unsign": 1, "unspecifi": [29, 30, 82], "unsqueez": [1, 82], "unstabl": 19, "unsupport": 93, "until": [0, 1, 3, 6, 9, 12], "untouch": 82, "unus": [0, 73], "up": [0, 5, 6, 10, 12, 20, 22, 23, 26, 27, 28, 29, 45, 70, 73, 79, 80, 93, 98], "up_proj": 17, "upcast": 82, "upcast_attent": 83, "upcast_softmax": 83, "upcom": [25, 98], "updat": [0, 8, 12, 16, 17, 19, 20, 23, 27, 28, 29, 31, 52, 65, 70, 82, 87, 92, 98], "update_from_dict": 70, "update_key_map": 17, "update_kv_cache_typ": 70, "update_output_ids_by_offset": 87, "update_resourc": [96, 98], "update_strategi": 82, "updatenumreturnbeam": 0, "updatespositionid": 1, "upgrad": [66, 67, 88], "uplift": [77, 79, 80], "upon": [12, 74, 80, 92, 93], "upper": [73, 82, 89], "uq_qr_gemm": 26, "url": [30, 34, 38, 39, 61, 65, 66, 67, 93], "us": [0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 36, 40, 41, 45, 48, 55, 56, 57, 58, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 82, 83, 84, 85, 87, 88, 90, 92, 93, 94, 95, 96, 97, 98, 99], "usabl": 94, "usag": [0, 5, 7, 8, 16, 19, 21, 24, 28, 29, 30, 40, 64, 70, 73, 80, 81, 82, 88, 93, 97], "use_beam_hyp": 87, "use_beam_search": [49, 70, 93], "use_cach": [82, 83, 84], "use_context_fmha_for_gener": 93, "use_cuda_graph": [20, 27, 56, 70, 74], "use_custom_all_reduc": 93, "use_diff_of_squar": 82, "use_dynamic_tre": [43, 44, 70], "use_embedding_shar": 93, "use_fp32_acc": 82, "use_fp8": 
83, "use_fp8_context_fmha": [5, 29, 73, 93], "use_fused_mlp": [29, 73, 93], "use_gemm_allreduce_plugin": 87, "use_gpt_attention_plugin": 87, "use_gpu_direct_storag": 87, "use_implicit_relative_attent": 83, "use_kv_cach": [70, 83, 87], "use_logn_sc": 83, "use_lora": 84, "use_lora_plugin": 87, "use_mamba_conv1d_plugin": 87, "use_meta_recip": 70, "use_modelopt_ckpt": 54, "use_modelopt_quant": 19, "use_mrop": 70, "use_one_more_block": 87, "use_paged_context_fmha": [5, 9, 29, 73, 77, 80], "use_parallel_embed": [15, 16, 84], "use_preload": 84, "use_prompt_tun": [84, 93], "use_py_sess": 92, "use_refit": 70, "use_relaxed_acceptance_for_think": [26, 27, 70], "use_runtime_default": 87, "use_safetensors_load": 84, "use_strip_plan": 70, "use_tqdm": 70, "use_variable_beam_width_search": 87, "usebantoken": 0, "usebanword": 0, "usecrossattent": 1, "usedefaultvalu": 1, "usednumblock": 0, "usedraftlogit": 1, "usedraftlogitshost": 1, "usedynamictre": 0, "usedynamictreehost": 1, "useexpliciteosstop": 0, "usefrequencypenalti": 0, "usegemmallreduceplugin": 1, "usegptattentionplugin": [1, 6], "usegpudirectstorag": 0, "uselanguageadapt": 1, "useloraplugin": 1, "usemambaconv1dplugin": 1, "usemaxlengthstop": 0, "useminlen": 0, "useminlength": 0, "useminp": 0, "usemrop": 1, "usenorepeatngrams": 0, "useoccurrencepenalti": 0, "usepackedinput": 1, "usepagedst": 1, "usepenalti": 0, "usepositionembed": 1, "usepresencepenalti": 0, "useprogthread": 0, "useprompttun": 1, "user": [0, 2, 3, 5, 6, 7, 9, 10, 11, 16, 17, 18, 19, 20, 24, 25, 26, 27, 28, 30, 31, 33, 34, 43, 44, 45, 49, 52, 53, 54, 60, 61, 65, 69, 70, 72, 73, 74, 79, 80, 81, 82, 84, 88, 89, 90, 92, 93], "user_buff": [29, 77], "userandomacceptancethreshold": 1, "userbuff": [70, 93], "userepetitionpenalti": 0, "userwarn": 67, "useshapeinfer": 1, "usespecdecod": 1, "usestopword": 0, "usetemp": 0, "usetemperatur": 0, "usetokentypeembed": 1, "usevariablebeamwidthsearch": 0, "usr": [15, 20, 30, 33, 34, 35, 37, 38, 39, 67, 73], "usual": [16, 19, 
27, 67, 70, 74, 75, 80, 82, 98], "util": [0, 1, 2, 5, 6, 12, 16, 20, 21, 26, 28, 29, 40, 67, 71, 72, 73, 77, 80, 81, 89, 93, 97], "uv": 28, "uv_gemm": 26, "uvm": [0, 1], "v": [1, 2, 5, 6, 10, 20, 21, 22, 25, 26, 28, 64, 82, 84, 87, 90, 91, 92, 95, 97], "v0": [10, 21, 22, 23, 24, 71, 73, 74, 91, 93], "v1": [30, 33, 34, 35, 38, 40, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 58, 59, 60, 61, 62, 66, 67, 69, 88, 91, 93, 94], "v10": 93, "v100": 93, "v12": 93, "v2": [25, 28, 90, 93], "v3": [27, 30, 72, 90, 91, 93], "v9": 23, "v_dim": 82, "v_head_dim": [82, 83], "v_proj": [17, 95], "vacat": [42, 46, 47, 49, 50], "valid": [0, 1, 3, 12, 27, 70, 74, 82, 87], "validate_cuda_graph_config": 70, "validate_cuda_graph_max_batch_s": 70, "validate_positive_valu": 70, "validatevec": 1, "validationerror": 70, "validmpiconfig": 1, "valu": [0, 1, 2, 5, 6, 8, 9, 10, 11, 13, 15, 16, 17, 20, 21, 22, 27, 28, 29, 30, 36, 59, 70, 73, 75, 77, 79, 81, 82, 84, 85, 86, 87, 89, 90, 92, 93, 97, 98, 99], "valuabl": 26, "value_typ": 0, "valuestatu": 1, "vanilla": [5, 97], "vanillaattent": 97, "var": 82, "vari": [23, 79, 80, 98], "variabl": [0, 1, 6, 8, 17, 20, 23, 26, 55, 56, 57, 64, 67, 70, 72, 73, 92, 93], "variabledraftlength": 1, "varianc": [28, 77, 79, 80, 82], "variant": [0, 3, 5, 19, 21, 27, 28, 69, 82, 88, 93, 97], "varieti": [73, 75, 93], "variou": [5, 12, 18, 73, 77, 79, 93], "varnam": 1, "vartyp": 1, "vboost": [20, 26, 73], "vbw": 93, "ve": [26, 58], "vec": [0, 1], "vec2": 82, "veclogprob": 0, "vectoken": 0, "vectokenextraid": [0, 1], "vector": [0, 1, 3, 5, 6, 8, 10, 28, 82], "vecuniquetoken": [0, 1], "verbatim": 84, "verbos": [29, 30, 73], "veri": [5, 15, 16, 18, 25, 27, 75, 76, 77, 93], "verif": [0, 12, 27, 70], "verifi": [12, 27, 64, 80, 82, 93], "verificationsets": 0, "versa": [9, 28], "version": [0, 1, 2, 5, 6, 15, 17, 19, 20, 26, 28, 30, 36, 65, 67, 73, 75, 82, 88, 92, 93, 94], "vertic": 82, "vertical_strid": 83, "vgqa": 8, "via": [0, 2, 11, 12, 26, 55, 56, 57, 58, 65, 67, 
73, 77, 78, 80, 81, 82, 93, 94], "vice": [9, 28, 59], "vicuna": [12, 43, 44, 54], "video": [34, 61, 73, 87, 91, 93], "video_grid_thw": 87, "video_path": 87, "video_preprocess": 87, "video_url": [34, 61], "view": [1, 27, 82, 87], "vila": [34, 61, 90, 91, 93], "vinyl": 73, "violat": 93, "virtual": [0, 1, 83], "vision": [87, 90, 91, 93], "vision_grid_thw": 87, "vision_length": 82, "vision_model_typ": 84, "vision_start": 82, "vision_token_mask": 83, "visit": [12, 26, 93], "visual": [79, 93], "visual_engine_dir": 87, "visual_featur": 87, "visualize_network": [29, 70, 93], "vit": 93, "vital": [7, 25], "vl": [30, 34, 39, 61, 73, 91, 93], "vlm": [91, 93], "vocab": [82, 87], "vocab_embed": [14, 17], "vocab_s": [0, 15, 17, 70, 83, 84, 87, 95], "vocab_size_pad": 87, "vocabs": [1, 6], "vocabsizepad": [0, 1], "vocabulari": [0, 1, 6, 9, 12, 74, 83, 87], "void": [0, 1, 3, 16], "volta": 93, "volum": [1, 11, 65, 73], "volumenonneg": 1, "vonjackustc": 93, "vote": [42, 46, 47, 49, 50], "vswa": 8, "vulner": 93, "vultureprim": 93, "w": [1, 24, 26, 28, 30, 82, 84, 90, 91, 93], "w1": 82, "w4a": [90, 93], "w4a16": [15, 25, 64, 70, 84], "w4a16_awq": [15, 19, 36, 59, 70], "w4a16_gptq": [15, 70], "w4a8": [25, 93], "w4a8_awq": [15, 19, 70], "w4a8_qserve_per_channel": 70, "w4a8_qserve_per_group": 70, "w4aint8": 93, "w8a": 90, "w8a16": [15, 25, 64, 70, 84], "w8a16_gptq": 70, "w8a8": [22, 25, 64], "w8a8_sq_per_channel": [15, 70], "w8a8_sq_per_channel_per_tensor_plugin": [70, 84], "w8a8_sq_per_channel_per_token_plugin": [70, 84], "w8a8_sq_per_tensor_per_token_plugin": [70, 84], "w8a8_sq_per_tensor_plugin": [70, 84], "wa": [0, 1, 3, 5, 6, 15, 27, 28, 67, 69, 73, 74, 75, 77, 79, 80, 81, 83, 90, 92, 93, 95, 99], "wai": [2, 5, 7, 11, 18, 26, 27, 28, 50, 52, 69, 71, 73, 75, 77, 82, 89, 93], "wait": [0, 1, 3, 19, 28, 36, 70, 71, 73, 82], "walk": [34, 58, 61, 75, 76, 77], "wang1120": 93, "wangkuiyi": 93, "want": [5, 12, 19, 26, 27, 32, 67, 70, 72, 73, 77, 79, 81, 82, 92, 93, 95], "warm": 98, "warmup": 
[20, 72, 73, 75, 93, 97, 98], "warn": [5, 29, 30, 70, 73, 74, 89], "warp": [11, 93], "wast": 28, "watch": 80, "wdkv": 26, "wdq": 26, "we": [1, 2, 4, 6, 7, 10, 11, 12, 13, 15, 19, 20, 24, 25, 26, 27, 28, 30, 31, 32, 42, 46, 47, 49, 50, 58, 59, 65, 67, 69, 72, 73, 74, 75, 76, 77, 79, 80, 82, 87, 88, 92, 93, 95], "weapon": 51, "wear": 51, "web": [18, 32], "weig": 82, "weight": [0, 1, 4, 10, 19, 21, 22, 25, 26, 27, 29, 30, 50, 64, 70, 71, 74, 75, 76, 77, 82, 83, 84, 87, 88, 93], "weight_index": 82, "weight_load": 83, "weight_only_groupwise_quant_matmul": 90, "weight_only_precis": 93, "weight_spars": [29, 70], "weight_stream": [13, 29, 70], "weightonlygroupwisequantmatmulplugin": 90, "weights_dict": 19, "weights_scaling_factor": [15, 17], "weightsinpoint": 1, "weightsoutpoint": 1, "well": [5, 6, 16, 18, 22, 36, 72, 79, 80, 90, 91], "were": [0, 1, 12, 15, 19, 21, 25, 28, 74, 76, 79, 93], "weren": 67, "wget": 92, "what": [2, 3, 28, 34, 58, 61, 64, 65, 72, 73, 75, 77, 79, 80], "whatev": 1, "wheel": [65, 67, 93], "when": [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 16, 17, 19, 20, 24, 25, 27, 28, 29, 31, 36, 52, 64, 65, 67, 70, 72, 73, 75, 77, 79, 80, 81, 82, 83, 84, 87, 88, 89, 90, 92, 93, 95, 97, 98], "whenev": 1, "where": [0, 1, 2, 5, 6, 8, 9, 11, 12, 15, 16, 21, 25, 26, 27, 28, 30, 33, 35, 36, 58, 60, 62, 70, 73, 74, 77, 79, 81, 82, 87, 88, 90, 93, 99], "wherea": [0, 15, 79], "whether": [0, 1, 2, 3, 5, 10, 29, 70, 76, 77, 80, 82, 83, 87, 96, 97], "which": [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 12, 15, 16, 17, 19, 21, 25, 26, 27, 28, 29, 30, 65, 67, 69, 70, 72, 73, 75, 77, 79, 80, 81, 82, 84, 85, 87, 88, 89, 90, 93, 94, 96, 97, 99], "while": [0, 1, 4, 7, 8, 9, 11, 12, 16, 19, 21, 22, 24, 25, 26, 27, 28, 67, 71, 73, 75, 76, 77, 78, 79, 80, 81, 82, 89, 90, 93, 97], "whisper": [90, 91, 93], "whisperencod": 84, "whl": [20, 65, 66, 67], "who": [27, 69], "whole": [1, 70, 71, 82], "whose": [2, 9, 15, 26, 83], "why": [0, 2, 16, 28, 70, 77, 79, 80, 82, 89], "wide": [0, 4, 27, 70, 75], 
"width": [0, 1, 5, 6, 39, 70, 83, 87, 89, 93], "win": 70, "window": [0, 1, 8, 12, 29, 64, 70, 73, 82, 87, 93], "window_s": 5, "windows": 0, "wip": 26, "wireless": 45, "wirelessaccesspoint": 45, "wise": [7, 70, 82, 93], "wish": 9, "wit": 51, "with_ssh": 31, "within": [1, 2, 5, 8, 11, 12, 16, 28, 51, 70, 73, 76, 77, 79, 80, 82, 88, 98], "without": [0, 1, 3, 5, 11, 12, 16, 17, 20, 25, 26, 29, 36, 51, 71, 73, 77, 80, 82, 84, 93, 95, 97], "wkr": 26, "wo": [17, 26, 93], "wo_gemm": 26, "won": [67, 76], "word": [0, 3, 5, 70, 82, 87, 93], "word_dict": 87, "word_embed": 17, "word_embeddings_layernorm": 17, "work": [5, 6, 7, 8, 11, 12, 16, 19, 20, 36, 52, 55, 56, 57, 59, 65, 67, 71, 74, 78, 82, 87, 90, 92, 93, 95], "workaround": [17, 20, 93], "workdir": [30, 55, 56, 57, 65], "worker": [16, 29, 30, 70, 73, 89, 93], "workerexecutablepath": 0, "workflow": [5, 6, 14, 15, 20, 27, 36, 64, 69, 74, 75, 77, 78, 82, 88, 92, 93, 94], "workload": [4, 11, 16, 28, 29, 72, 73, 75, 77, 78, 79, 80], "workspac": [1, 29, 30, 70, 73, 82, 89, 93], "workstat": 22, "world": [0, 2, 7, 20, 27, 29, 55, 56, 57, 71, 73, 75, 76, 77, 82], "world_config": 87, "world_siz": [15, 19, 82, 93], "worldconfig": [0, 6, 87], "worldsiz": 1, "wors": [12, 29, 77], "worst": [79, 80], "worth": [5, 8, 77, 80], "would": [0, 7, 12, 27, 73, 75, 77, 79, 81, 82, 95], "wouldn": 51, "wpa2": 45, "wqr": 26, "wrap": [0, 1, 16, 29, 69, 75, 82, 85, 87, 93], "wrapped_properti": 70, "wrapper": [1, 7, 19, 97], "write": [0, 1, 9, 17, 26, 29, 64, 82, 92], "written": [16, 73, 82], "wrong": [12, 51, 93], "wsl": 93, "wuk": 26, "wuq": 26, "wuv": 26, "www": 93, "x": [0, 1, 3, 6, 10, 13, 30, 73, 82, 83, 84, 88, 90, 93], "x86": 9, "x86_64": 91, "xcomposer2": 93, "xgrammar": [0, 3, 45, 93], "xl": 93, "xml": 3, "xor": 82, "xqa": 93, "xxx": [17, 19, 92], "xxx_plugin": 85, "xy": 82, "y": [2, 3, 20, 24, 31, 65, 66, 67, 73, 82, 84, 90], "y_bia": 82, "yaml": [30, 73, 74], "yarn": 82, "ye": [2, 82, 89], "yeah": 58, "yelp": 91, "yen": 73, "yet": [0, 6, 
19, 22, 26, 82, 99], "yield": [9, 28, 36, 77, 79], "yiyixu": [34, 61], "yml": [20, 27, 30, 37, 73, 74], "york": [30, 33, 35, 60, 62, 88], "you": [3, 4, 5, 6, 7, 9, 10, 12, 15, 16, 18, 19, 20, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 36, 45, 46, 49, 52, 55, 56, 57, 58, 59, 60, 61, 64, 65, 67, 69, 70, 73, 74, 76, 77, 78, 79, 80, 81, 82, 87, 88, 89, 92, 93, 94, 95, 97], "your": [9, 10, 11, 12, 18, 19, 20, 25, 27, 29, 31, 32, 36, 58, 65, 67, 69, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 88, 92, 95, 97, 98], "your_data_path": [20, 27], "your_dockerhub_usernam": [31, 32], "your_model_dir": 27, "your_model_path": 20, "your_public_kei": 32, "your_work_path": 20, "yourself": 94, "yuhuili": [43, 44], "yyi": 92, "z": 82, "zars19": 93, "zero": [0, 1, 3, 17, 69, 70, 82, 83, 90, 92], "zero_is_placehold": 82, "zip": 52, "zjli2013": 93, "zoo": 93}, "titles": ["Executor", "Runtime", "Disaggregated-Service (experimental)", "Executor API", "Expert Parallelism in TensorRT-LLM", "Multi-Head, Multi-Query, and Group-Query Attention", "C++ GPT Runtime", "Graph Rewriting Module", "KV Cache Management: Pools, Blocks, and Events", "KV cache reuse", "Run gpt-2b + LoRA using Executor / cpp runtime", "Low-Precision-AllReduce", "Speculative Sampling", "Running With Weight Streaming to Reduce GPU Memory Consumption", "Adding a Model", "TensorRT-LLM Checkpoint", "Model Definition", "TensorRT-LLM Model Weights Loader", "TensorRT-LLM Architecture", "TensorRT-LLM Build Workflow", "How to get best performance on DeepSeek-R1 in TensorRT-LLM", "Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100", "H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token", "H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM", "New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget", "Speed up inference with SOTA quantization techniques in TRT-LLM", "Pushing Latency Boundaries: Optimizing DeepSeek-R1 
Performance on NVIDIA B200 GPUs", "DeepSeek R1 MTP Implementation and Optimization", "Optimizing DeepSeek R1 Throughput on NVIDIA Blackwell GPUs: A Deep Dive for Developers", "trtllm-build", "trtllm-serve", "Build the TensorRT-LLM Docker Image", "Develop TensorRT-LLM on Runpod", "Curl Chat Client", "Curl Chat Client For Multimodal", "Curl Completion Client", "LLM Common Customizations", "Deepseek R1 Reasoning Parser", "Genai Perf Client", "Genai Perf Client For Multimodal", "LLM Examples Introduction", "LLM Examples", "Automatic Parallelism with LLM", "Generate Text Using Eagle2 Decoding", "Generate Text Using Eagle Decoding", "Generate text with guided decoding", "Generate text", "Generate Text Asynchronously", "Generate Text in Streaming", "Generate text with customization", "Distributed LLM Generation", "Get KV Cache Events", "Control generated text using logits processor", "Generate Text Using Lookahead Decoding", "Generate Text Using Medusa Decoding", "Llm Mgmn Llm Distributed", "Llm Mgmn Trtllm Bench", "Llm Mgmn Trtllm Serve", "Generate text with multiple LoRA adapters", "Generation with Quantization", "OpenAI Chat Client", "OpenAI Chat Client", "OpenAI Completion Client", "Online Serving Examples", "Welcome to TensorRT-LLM\u2019s Documentation!", "Building from Source Code on Linux", "Installing on Grace Hopper", "Installing on Linux", "Key Features", "API Introduction", "API Reference", "Overview", "Performance Analysis", "TensorRT-LLM Benchmarking", "Overview", "Benchmarking Default Performance", "Deciding Model Sharding Strategy", "FP8 Quantization", "Performance Tuning Guide", "Tuning Max Batch Size and Max Num Tokens", "Useful Build-Time Flags", "Useful Runtime Options", "Functionals", "Layers", "Models", "Plugin", "Quantization", "Runtime", "Quick Start Guide", "Memory Usage of TensorRT-LLM", "Numerical Precision", "Support Matrix", "Troubleshooting", "Release Notes", "PyTorch Backend", "Adding a New Model in PyTorch Backend", "Architecture Ovewiew", 
"Attention", "KV Cache Manager", "Scheduler"], "titleterms": {"": [5, 22, 25, 64], "0": 93, "000": [22, 23], "1": [14, 16, 20, 65, 74, 89, 93], "10": [22, 93], "100m": 22, "11": 93, "12": [23, 93], "13": 93, "13b": 23, "14": 93, "15": 93, "16": 93, "17": 93, "18": 93, "180b": 21, "19": 93, "2": [14, 20, 24, 65, 89, 93], "2b": 10, "3": [14, 16, 20, 73, 74, 89, 91], "4": [14, 20, 22], "405b": [16, 74], "4x": 24, "5": 20, "6": [20, 21], "6x": 22, "7": 93, "70b": [16, 21, 24, 73, 74], "7x": 21, "8": 93, "8b": 74, "9": 93, "A": 28, "As": 3, "For": [34, 39], "In": [3, 5, 71], "Not": 89, "One": [26, 65], "The": [3, 90], "To": 75, "With": [13, 71], "a100": [21, 22], "about": [12, 30, 71, 76], "absorb": 28, "accept": [26, 27], "access": 31, "account": 32, "accuraci": [11, 25, 27], "achiev": [22, 23, 27], "acknowledg": [26, 27, 28], "activ": [83, 89], "ad": [14, 95], "adapt": 58, "addit": 3, "adp": 28, "advanc": 64, "algorithm": 11, "alibi": 5, "allreduc": 11, "an": 8, "analysi": 72, "announc": 93, "api": [3, 7, 13, 19, 30, 40, 69, 70, 75, 88, 93, 96], "arbitrari": 3, "architectur": [18, 26, 64, 96], "argument": 29, "asynchron": 47, "asyncio": 36, "attent": [5, 15, 26, 27, 28, 71, 79, 80, 81, 83, 97], "attentionbackend": 97, "attentionmetadata": 97, "auto": 29, "automat": 42, "autoregress": 26, "avoid": 75, "awq": [15, 21, 90], "b200": [20, 26], "backend": [26, 91, 94, 95, 97], "background": [26, 27], "balanc": 26, "base": [27, 36], "baselin": 77, "basic": 27, "batch": [3, 5, 71, 79], "beam": [3, 5], "befor": [73, 75], "begin": 75, "behavior": 73, "bench": [56, 72, 75], "benchmark": [2, 20, 25, 30, 73, 74, 75], "best": [20, 25], "bf16": 90, "bia": 5, "bind": [3, 16, 65], "blackwel": [28, 90], "block": 8, "blockmanag": 8, "boost": 73, "boundari": 26, "budget": 24, "buffer": [5, 77, 89], "buffermanag": 1, "build": [15, 19, 20, 29, 31, 32, 36, 65, 73, 75, 80], "c": [3, 6, 65, 89], "cach": [5, 8, 9, 15, 51, 77, 81, 89, 98], "cachecommun": 0, "can": [9, 71], "capac": 81, "case": 
79, "cast": 83, "caveat": 73, "chang": [13, 79, 93], "chat": [30, 33, 34, 60, 61], "checkpoint": 15, "choos": 25, "chunk": [5, 20, 79, 81], "class": 3, "classic": 7, "cli": [19, 75], "client": [33, 34, 35, 38, 39, 60, 61, 62], "clock": [20, 73], "close": [21, 24], "code": 65, "collect": 72, "combin": 20, "come": 25, "command": 74, "common": [1, 36, 71], "commun": [26, 76], "compil": [16, 20, 65, 88], "complet": [30, 35, 62], "compon": [6, 94], "conclus": [77, 79, 80], "config": [15, 29], "configur": [3, 6, 10, 26, 32, 36, 77, 80, 95], "connect": 32, "consider": 11, "consumpt": 13, "contain": [20, 31, 65], "content": [20, 26, 27, 28, 78, 95], "context": [3, 5, 20, 79, 80, 81], "contigu": 5, "control": [3, 52], "conv": 83, "convers": [14, 19], "coordin": 72, "core": 95, "cpp": 10, "creat": [32, 65], "cross": 5, "cuda": 26, "cudaev": 1, "cudastream": 1, "curl": [33, 34, 35], "custom": [17, 36, 49, 98, 99], "cutlass": 26, "cyclic": 5, "data": 28, "dataset": [20, 73, 74, 75], "datatransceiverst": 0, "debug": [2, 72, 92], "decid": 76, "decod": [3, 12, 27, 29, 43, 44, 45, 53, 54, 89, 96], "decoderst": 1, "decodinginput": 1, "decodingoutput": 1, "decor": 7, "deep": 28, "deepseek": [20, 26, 27, 28, 37], "default": [20, 26, 73, 75], "definit": [16, 88, 95], "dens": 26, "depend": 26, "deploi": 88, "dequant": 90, "descript": 72, "detail": [10, 90], "develop": [28, 32, 94], "diagram": 26, "differ": 3, "disabl": 36, "disaggreg": [2, 30], "disaggregated_mpi_work": 30, "disaggserverutil": 0, "distribut": [50, 55], "dive": 28, "do": 71, "docker": [31, 32, 65], "dockerhub": [31, 32], "document": [64, 93], "dora": 10, "download": 20, "dq": 90, "draft": 12, "e2": 92, "eagl": [12, 27, 44], "eagle2": 43, "eagle3": 27, "eaglebuff": 1, "eaglemodul": 1, "embed": [5, 83], "enabl": [4, 9, 20, 31, 72, 77, 80], "endpoint": 30, "engin": [15, 16, 69, 73, 75, 88, 96], "enhanc": 93, "environ": [2, 11], "ep": 28, "error": 92, "etp": 26, "evalu": [15, 27], "event": [8, 51], "everyth": 26, "exampl": 
[2, 3, 10, 15, 16, 17, 40, 41, 63, 72, 73], "except": 89, "execut": 92, "executor": [0, 3, 10], "expect": [9, 20], "experiment": 2, "expert": [4, 26, 28], "explicitdrafttokensbuff": 1, "explor": 20, "face": 69, "factor": [5, 15], "falcon": 21, "faq": [2, 89], "faster": 21, "featur": [20, 68, 72, 93], "file": 65, "first": 22, "fix": [27, 93], "flag": [80, 90], "flayerinfo": 7, "flight": [3, 5, 71], "flow": 73, "fmha": 5, "format": [10, 20], "fp16": 90, "fp32": 90, "fp4": 74, "fp8": [5, 15, 22, 71, 74, 77, 90], "fraction": 81, "free": 81, "from": 65, "full": 65, "fulli": 17, "function": [7, 17, 82], "fuse_a_gemm": 26, "fusion": [16, 26, 77, 80], "futur": [26, 27, 28, 36], "garbag": 72, "gate": 77, "gc": 72, "gemm": [26, 77, 80], "genai": [38, 39], "gener": [2, 5, 36, 43, 44, 45, 46, 47, 48, 49, 50, 52, 53, 54, 58, 59], "get": [20, 51, 64], "gil": 72, "gpt": [6, 10], "gptdecod": 1, "gptdecoderbatch": 1, "gptjsonconfig": 1, "gptq": 90, "gpu": [13, 16, 20, 21, 26, 28, 71, 73, 81, 89], "grace": 66, "graph": [7, 26], "group": [5, 26], "guid": [3, 45, 78, 88, 94, 95], "h": [0, 1], "h100": [22, 23], "h200": [20, 21, 23, 24], "ha": 22, "hardwar": 91, "hbm": 23, "head": 5, "header": 65, "hierarchi": 8, "high": 7, "hopper": [66, 90], "host": 9, "how": [4, 9, 20, 26, 27, 28, 73, 76, 79], "hub": 69, "hug": 69, "i": [22, 76, 89], "ibuff": 1, "id": 10, "igptdecoderbatch": 1, "imag": [31, 32, 65], "implement": [14, 26, 27, 97], "import": 5, "improv": 12, "increas": 24, "indic": 64, "infer": [3, 25, 27, 30, 71, 88, 89], "inform": [7, 72, 88], "infrastructur": 93, "input": 5, "instal": [20, 64, 66, 67, 92], "int4": [21, 90], "int8": [5, 90], "interfac": 98, "intern": 6, "introduct": [28, 40, 69, 95, 98, 99], "ipcnvlsmemori": 1, "ipcutil": 1, "isl": 20, "issu": [20, 27, 89, 93, 94], "itensor": 1, "iter": 72, "kei": [17, 26, 32, 68, 76, 93, 94], "kernel": [24, 26], "knowledg": 78, "known": [27, 65, 89, 93, 94], "kv": [5, 8, 9, 15, 51, 77, 81, 89, 98], "kvcacheeventmanag": 8, 
"kvcachemanag": 96, "latenc": [20, 24, 26, 73, 75, 77], "latest": [23, 71], "launch": [26, 72], "layer": [26, 28, 83], "layernorm": 15, "layout": 17, "level": [7, 26, 96], "limit": [12, 65, 73, 93], "linear": 83, "link": 65, "linux": [65, 67], "llama": [16, 21, 24, 73, 74, 77, 80], "llama2": 23, "llm": [4, 12, 15, 17, 18, 19, 20, 22, 23, 25, 27, 31, 32, 36, 40, 41, 42, 50, 55, 56, 57, 64, 65, 69, 71, 73, 75, 79, 88, 89, 91, 93], "load": [17, 95], "loader": 17, "local": 69, "logit": [3, 29, 52], "lookahead": [12, 53], "lookaheadbuff": 1, "lookaheadmodul": 1, "lookup": 12, "lora": [10, 29, 58], "loracach": [1, 10], "loracachepagemanagerconfig": 1, "loramodul": 1, "low": [11, 73, 77], "make": 15, "manag": [7, 8, 73, 98], "map": [10, 73], "mark": 3, "marker": 72, "match": 16, "matrix": [90, 91], "max": [20, 73, 79, 81], "maximum": 81, "measur": 74, "medusa": [12, 54, 73], "medusamodul": 1, "memori": [9, 13, 20, 23, 81, 89], "memorycount": 1, "method": [7, 25], "metric": 30, "mgmn": [55, 56, 57], "min": 20, "mix": 26, "mixtur": 4, "mla": 28, "mlp": [15, 77, 83], "mlperf": 22, "modal": [73, 91], "mode": 73, "model": [6, 12, 14, 16, 17, 18, 20, 26, 27, 69, 73, 74, 76, 77, 80, 84, 88, 91, 92, 93, 95, 96], "modelconfig": 1, "modul": [7, 10, 27, 28], "moe": [4, 28], "moe_backend": 26, "more": [20, 24, 72], "mqa": 28, "mtp": [26, 27], "multi": [5, 16, 26, 30, 71, 73, 91], "multimod": [30, 34, 39], "multipl": [58, 80], "name": [17, 29], "nativ": [17, 71], "nearli": 23, "network": 73, "new": [14, 24, 95, 97], "next": [25, 88], "node": [16, 30, 71], "non": 73, "norm": [77, 80], "normal": 83, "note": [3, 5, 93], "nsight": 72, "num": 79, "numer": 90, "nvfp4": 90, "nvidia": [26, 28, 72], "nvtx": 72, "o": 89, "obtain": 3, "offload": 9, "onli": [26, 65, 72, 90], "onlin": 63, "openai": [60, 61, 62], "optim": [5, 26, 27, 28, 80], "option": [20, 65, 77, 80, 81], "osl": 20, "other": 73, "out": [20, 95], "output": [3, 73], "over": 21, "overview": [6, 15, 17, 19, 71, 74], "ovewiew": 96, 
"own": 99, "p": 9, "pack": 5, "pad": 5, "page": [5, 8, 71, 79, 80, 81], "parallel": [4, 10, 26, 28, 29, 42, 73, 76, 80], "paramet": 6, "parser": 37, "part": 14, "pattern": [7, 16], "perf": [38, 39], "perform": [9, 11, 12, 20, 22, 25, 26, 64, 72, 75, 77, 78, 80], "persist": 73, "phase": 5, "pipelin": [76, 80], "pitfal": 75, "plugin": [16, 29, 77, 80, 85], "pod": 32, "polici": 81, "pool": [8, 83, 89], "posit": 5, "post": 3, "postprocess": 17, "power": 73, "practic": 25, "precis": [11, 26, 28, 90], "prepar": [15, 20, 32, 69, 73, 74, 75], "prerequisit": [20, 65, 78, 88, 95], "prevent": 9, "processor": [3, 52], "profil": [26, 72, 80], "programmat": 26, "prompt": 12, "prompttuningparam": 1, "provid": 24, "push": 26, "pyexecutor": 96, "python": [3, 65, 89], "pytorch": [72, 73, 91, 94, 95], "q": 90, "qkv": 5, "quantiz": [15, 19, 25, 36, 59, 73, 77, 86, 90, 94], "quantmod": 90, "queri": 5, "quick": [88, 94], "quickstart": 73, "r1": [20, 26, 27, 28, 37], "rab": 5, "rank": 15, "rawengin": 1, "re": 26, "reason": 37, "recommend": [77, 80, 89], "record_signatur": 7, "redraft": 12, "reduc": [13, 77, 80], "refer": [14, 64, 70], "regist": 14, "registr": 95, "rel": 5, "relat": [7, 88], "relax": [26, 27], "releas": 93, "reproduc": [20, 26, 28, 74], "request": [1, 3], "requir": [7, 11], "resourcemanag": 96, "respons": 3, "result": [3, 20, 72, 74, 75], "retriev": 7, "reus": 9, "revisit": 79, "rewrit": 7, "right": 25, "roll": 5, "rope": 5, "rotari": 5, "router": 26, "routergemm": 26, "run": [10, 13, 20, 27, 72, 73, 74, 75, 88], "runpod": 32, "runtim": [1, 6, 10, 16, 28, 36, 65, 81, 87, 89], "runtimedefault": 1, "same": 24, "sampl": [6, 12, 36], "samplingconfig": 1, "save": 75, "scale": [5, 15], "scatter": 80, "schedul": [79, 81, 96, 99], "script": [41, 63], "search": 5, "sec": 23, "send": 3, "serial": 0, "serv": [30, 57, 63, 72, 88], "server": [3, 30, 88], "servic": 2, "set": [73, 76], "shard": 76, "shoot": 17, "singl": 21, "situat": 9, "size": [79, 81, 89], "slide": 5, "slurm": 30, 
"smart": 26, "smoothquant": 90, "softwar": 91, "sota": 25, "sourc": 65, "spars": 26, "specif": 72, "specul": [12, 27, 29], "speculativedecodingmod": 1, "speculativedecodingmodul": 1, "speed": 25, "speedup": 27, "ssh": [31, 32], "start": [30, 64, 88, 94], "step": [14, 20, 65, 88, 95], "strategi": [26, 28, 76], "stream": [13, 26, 48], "streamingllm": 5, "structur": 3, "studi": [27, 79], "style": 36, "subcommand": 73, "summari": [73, 77, 80], "support": [16, 17, 20, 27, 65, 69, 71, 73, 90, 91], "swiglu": 77, "syntax": 30, "system": [26, 72], "tabl": [20, 26, 27, 28, 64, 78, 95], "target": 12, "technic": 90, "techniqu": 25, "templat": 32, "tensor": [0, 3, 4, 5, 7, 10, 76, 89], "tensorrt": [4, 12, 15, 16, 17, 18, 19, 20, 22, 23, 25, 27, 31, 32, 64, 65, 69, 71, 73, 75, 79, 88, 89, 91, 93], "test": 92, "text": [43, 44, 45, 46, 47, 48, 49, 52, 53, 54, 58], "think": 76, "throughput": [20, 24, 28, 73, 74, 75], "time": [80, 89], "tip": [69, 75, 92], "tllmlogger": 1, "tok": 22, "token": [22, 23, 36, 79, 81], "tool": 19, "top": 96, "topologi": 11, "transferag": 0, "translat": 17, "tree": [12, 27, 95], "trigger": 8, "triton": [3, 88], "troubl": 17, "troubleshoot": [2, 69, 75, 92], "trt": 25, "trtllm": [26, 29, 30, 56, 57, 72, 75, 88], "tune": [9, 20, 78, 79], "type": [0, 8], "understand": [79, 89], "unit": 92, "up": [21, 24, 25], "updat": 93, "upload": [31, 32], "us": [7, 10, 12, 43, 44, 52, 53, 54, 80, 81, 89], "usag": [2, 11, 89], "user": 77, "v": [4, 23], "valid": 73, "vanilla": 27, "variabl": [2, 11, 74], "verif": 26, "verifi": 14, "via": 75, "visual": 72, "w4a16": 90, "w8a16": 90, "w8a8": 90, "weight": [13, 14, 15, 16, 17, 18, 28, 89, 90, 95], "welcom": 64, "what": [8, 22, 25, 71], "when": [7, 26], "width": 3, "window": [5, 71, 81], "windowblockmanag": 8, "wip": 20, "within": 24, "without": 65, "work": [26, 27, 28, 73], "workflow": [7, 17, 19, 72, 73], "workload": 26, "world": 6, "worldconfig": 1, "write": 14, "xqa": [5, 24], "you": [71, 75], "your": 99}})
\ No newline at end of file
diff --git a/latest/torch.html b/latest/torch.html
index a67602b722..e23065ca67 100644
--- a/latest/torch.html
+++ b/latest/torch.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -63,7 +63,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -336,6 +336,7 @@
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -357,6 +358,7 @@
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -421,6 +423,7 @@
 <li class="toctree-l1"><a class="reference internal" href="advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -455,6 +458,7 @@
 <li class="toctree-l1"><a class="reference internal" href="blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -730,6 +734,15 @@ scripts/huggingface_example.sh<span class="w"> </span>--model<span class="w"> </
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/torch/adding_new_model.html b/latest/torch/adding_new_model.html
index b728120d1e..07ec6e7360 100644
--- a/latest/torch/adding_new_model.html
+++ b/latest/torch/adding_new_model.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -61,7 +61,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -334,6 +334,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -355,6 +356,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -419,6 +421,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -453,6 +456,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -841,6 +845,15 @@
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/torch/arch_overview.html b/latest/torch/arch_overview.html
index e6f16a6162..cbd02c40c2 100644
--- a/latest/torch/arch_overview.html
+++ b/latest/torch/arch_overview.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -61,7 +61,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -334,6 +334,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -355,6 +356,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -419,6 +421,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -453,6 +456,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -707,6 +711,15 @@ The document <a class="reference internal" href="kv_cache_manager.html"><span cl
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/torch/attention.html b/latest/torch/attention.html
index 6690997c5c..3f4be2531c 100644
--- a/latest/torch/attention.html
+++ b/latest/torch/attention.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -61,7 +61,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -334,6 +334,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -355,6 +356,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -419,6 +421,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -453,6 +456,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -518,7 +522,7 @@ The following sections explain how to use these implementations and provide a br
 <section id="attention-backends">
 <h2>Attention Backends<a class="headerlink" href="#attention-backends" title="Link to this heading">#</a></h2>
 <p>There are currently three available attention backends: the vanilla backend, the TRT-LLM backend, and the Flashinfer backend.
-You can specify the desired attention backend using <code class="docutils literal notranslate"><span class="pre">PyTorchConfig.attn_backend</span></code>. For instance, to utilize the Flashinfer backend, you can create a <code class="docutils literal notranslate"><span class="pre">PyTorchConfig</span></code> with <code class="docutils literal notranslate"><span class="pre">attn_backend</span> <span class="pre">=</span> <span class="pre">&quot;flashinfer&quot;</span></code> and then pass it to the <code class="docutils literal notranslate"><span class="pre">LLM</span></code> constructor as follows: <code class="docutils literal notranslate"><span class="pre">LLM(pytorch_backend_config=pytorch_config)</span></code>. This will enable the use of the Flashinfer backend for your model.</p>
+You can specify the desired attention backend using <code class="docutils literal notranslate"><span class="pre">PyTorchConfig.attn_backend</span></code>. For instance, to utilize the Flashinfer backend, you can pass <code class="docutils literal notranslate"><span class="pre">attn_backend=&quot;flashinfer&quot;</span></code> to the <code class="docutils literal notranslate"><span class="pre">LLM</span></code> constructor as follows: <code class="docutils literal notranslate"><span class="pre">LLM(attn_backend=&quot;flashinfer&quot;)</span></code>. This will enable the use of the Flashinfer backend for your model.</p>
 <p>The vanilla backend, <code class="docutils literal notranslate"><span class="pre">VanillaAttention</span></code>, is a reference implementation designed primarily for inflight batching and linear KV cache support. While it serves as a useful baseline, it is not recommended for production use due to its limited optimizations.</p>
 <p>In contrast, the Flashinfer backend, <code class="docutils literal notranslate"><span class="pre">FlashInferAttention</span></code>, is performance-optimized and supports both inflight batching and paged KV cache. It also includes the following advanced features:</p>
 <ol class="arabic simple">
@@ -831,6 +835,15 @@ For example, the Flashinfer metadata fills <code class="docutils literal notrans
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/torch/kv_cache_manager.html b/latest/torch/kv_cache_manager.html
index 0f379514aa..d37fa73257 100644
--- a/latest/torch/kv_cache_manager.html
+++ b/latest/torch/kv_cache_manager.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -61,7 +61,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -334,6 +334,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -355,6 +356,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -419,6 +421,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -453,6 +456,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -509,6 +513,7 @@
 <p>In Transformer-based models, the KV (Key-Value) Cache is a mechanism used to optimize decoding efficiency, particularly during autoregressive generation tasks.
 Since KV Cache requires memory to store, it is also an important resource.
 In TensorRT-LLM, KV Cache is managed by the <code class="docutils literal notranslate"><span class="pre">KVCacheManager</span></code>.</p>
+<p>For details of the TensorRT-LLM <code class="docutils literal notranslate"><span class="pre">KVCacheManager</span></code> implementation, see <a class="reference internal" href="../advanced/kv-cache-management.html"><span class="std std-doc">KV Cache Management</span></a>.</p>
 <section id="kv-cache-manager-introduction">
 <h2>KV Cache Manager Introduction<a class="headerlink" href="#kv-cache-manager-introduction" title="Link to this heading">#</a></h2>
 <p><code class="docutils literal notranslate"><span class="pre">KVCacheManager</span></code> is a type of resource manager, inheriting from <code class="docutils literal notranslate"><span class="pre">BaseResourceManager</span></code>.
@@ -699,6 +704,15 @@ Then, test it to ensure the <code class="docutils literal notranslate"><span cla
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   
diff --git a/latest/torch/scheduler.html b/latest/torch/scheduler.html
index 912efd872e..225fef0c00 100644
--- a/latest/torch/scheduler.html
+++ b/latest/torch/scheduler.html
@@ -51,7 +51,7 @@
     <script>
         DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
         DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
-        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
+        DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.21.0rc0';
         DOCUMENTATION_OPTIONS.show_version_warning_banner =
             false;
         </script>
@@ -61,7 +61,7 @@
 
   <meta name="viewport" content="width=device-width, initial-scale=1"/>
   <meta name="docsearch:language" content="en"/>
-  <meta name="docsearch:version" content="0.20.0rc3" />
+  <meta name="docsearch:version" content="0.21.0rc0" />
 
 
   </head>
@@ -334,6 +334,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -355,6 +356,7 @@
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
+<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle2_decoding.html">Generate Text Using Eagle2 Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
 <li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
@@ -419,6 +421,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-management.html">KV Cache Management: Pools, Blocks, and Events</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
@@ -453,6 +456,7 @@
 <li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
 <li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.html">DeepSeek R1 MTP Implementation and Optimization</a></li>
 </ul>
 </div>
 </nav></div>
@@ -725,6 +729,15 @@ In the <code class="docutils literal notranslate"><span class="pre">create_pytor
   </p>
 </div>
       
+        <div class="footer-item">
+<div class="extra_footer">
+  
+  <p>Last updated on June 03, 2025.</p>
+  
+  <p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ae2ce6">9ae2ce6</a>.</p>
+  
+</div></div>
+      
     </div>
   
   

Components	Parallel Patterns
Attention Modules	Data Parallelism 8 (DP8)
MoE Sparse Experts	Expert Parallelism 8 (EP8)
MoE Shared Experts	DP8
Fuse_A GEMM	DP8
Router GEMM	DP8
GEMM	group	GEMM N	GEMM K
shared_fc1	1	4096	7168
shared_fc2	1	7168	2048
sparse_fc1	256	4096	7168
sparse_fc2	256	7168	2048
KV Cache Type	FP8 Checkpoint	FP4 Checkpoint
BF16 MLA and KV cache	0.9629	0.9606
FP8 MLA and KV cache	0.9613	0.9606