TensorRT-LLMs/llm-api/reference.html
2025-05-20 09:23:51 +00:00

3678 lines
492 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!DOCTYPE html>
<html lang="en" data-content_root="../" >
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<title>API Reference &#8212; TensorRT-LLM</title>
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "";
</script>
<!--
this give us a css class that will be invisible only if js is disabled
-->
<noscript>
<style>
.pst-js-only { display: none !important; }
</style>
</noscript>
<!-- Loaded before other Sphinx assets -->
<link href="../_static/styles/theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=8878045cc6db502f8baf" rel="stylesheet" />
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=8f2a1f02" />
<link rel="stylesheet" type="text/css" href="../_static/styles/nvidia-sphinx-theme.css?v=df3ac72c" />
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
<link rel="stylesheet" type="text/css" href="../_static/autodoc_pydantic.css" />
<!-- So that users can add custom icons -->
<script src="../_static/scripts/fontawesome.js?digest=8878045cc6db502f8baf"></script>
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf" />
<script src="../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../_static/doctools.js?v=9a2dae69"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/clipboard.min.js?v=a7894cd8"></script>
<script src="../_static/copybutton.js?v=65e89d2a"></script>
<script>DOCUMENTATION_OPTIONS.pagename = 'llm-api/reference';</script>
<script>
DOCUMENTATION_OPTIONS.theme_version = '0.16.1';
DOCUMENTATION_OPTIONS.theme_switcher_json_url = './_static/switcher.json';
DOCUMENTATION_OPTIONS.theme_switcher_version_match = '0.20.0rc3';
DOCUMENTATION_OPTIONS.show_version_warning_banner =
false;
</script>
<link rel="icon" href="../_static/favicon.png"/>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="LLM Examples Introduction" href="../examples/index.html" />
<link rel="prev" title="API Introduction" href="index.html" />
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<meta name="docsearch:language" content="en"/>
<meta name="docsearch:version" content="0.20.0rc3" />
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<div id="pst-skip-link" class="skip-link d-print-none"><a href="#main-content">Skip to main content</a></div>
<div id="pst-scroll-pixel-helper"></div>
<button type="button" class="btn rounded-pill" id="pst-back-to-top">
<i class="fa-solid fa-arrow-up"></i>Back to top</button>
<dialog id="pst-search-dialog">
<form class="bd-search d-flex align-items-center"
action="../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
placeholder="Search the docs ..."
aria-label="Search the docs ..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form>
</dialog>
<div class="pst-async-banner-revealer d-none">
<aside id="bd-header-version-warning" class="d-none d-print-none" aria-label="Version warning"></aside>
</div>
<header class="bd-header navbar navbar-expand-lg bd-navbar d-print-none">
<div class="bd-header__inner bd-page-width">
<button class="pst-navbar-icon sidebar-toggle primary-toggle" aria-label="Site navigation">
<span class="fa-solid fa-bars"></span>
</button>
<div class="col-lg-3 navbar-header-items__start">
<div class="navbar-item">
<a class="navbar-brand logo" href="../index.html">
<img src="../_static/nvidia-logo-horiz-rgb-blk-for-screen.svg" class="logo__image only-light" alt="TensorRT-LLM - Home"/>
<img src="../_static/nvidia-logo-horiz-rgb-wht-for-screen.svg" class="logo__image only-dark pst-js-only" alt="TensorRT-LLM - Home"/>
<p class="title logo__title">TensorRT-LLM</p>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<div class="navbar-item">
<div class="version-switcher__container dropdown pst-js-only">
<button id="pst-version-switcher-button-2"
type="button"
class="version-switcher__button btn btn-sm dropdown-toggle"
data-bs-toggle="dropdown"
aria-haspopup="listbox"
aria-controls="pst-version-switcher-list-2"
aria-label="Version switcher list"
>
Choose version <!-- this text may get changed later by javascript -->
<span class="caret"></span>
</button>
<div id="pst-version-switcher-list-2"
class="version-switcher__menu dropdown-menu list-group-flush py-0"
role="listbox" aria-labelledby="pst-version-switcher-button-2">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div></div>
</div>
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
</div>
<div class="navbar-item">
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light" title="Light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark" title="Dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto" title="System Settings"></i>
</button></div>
</div>
</div>
<div class="navbar-persistent--mobile">
<button class="btn search-button-field search-button__button pst-js-only" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
<span class="search-button__default-text">Search</span>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd class="kbd-shortcut__modifier">K</kbd></span>
</button>
</div>
<button class="pst-navbar-icon sidebar-toggle secondary-toggle" aria-label="On this page">
<span class="fa-solid fa-outdent"></span>
</button>
</div>
</header>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<dialog id="pst-primary-sidebar-modal"></dialog>
<div id="pst-primary-sidebar" class="bd-sidebar-primary bd-sidebar">
<a class="navbar-brand logo" href="../index.html">
<img src="../_static/nvidia-logo-horiz-rgb-blk-for-screen.svg" class="logo__image only-light" alt="TensorRT-LLM - Home"/>
<img src="../_static/nvidia-logo-horiz-rgb-wht-for-screen.svg" class="logo__image only-dark pst-js-only" alt="TensorRT-LLM - Home"/>
<p class="title logo__title">TensorRT-LLM</p>
</a>
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<div class="navbar-item">
<div class="version-switcher__container dropdown pst-js-only">
<button id="pst-version-switcher-button-3"
type="button"
class="version-switcher__button btn btn-sm dropdown-toggle"
data-bs-toggle="dropdown"
aria-haspopup="listbox"
aria-controls="pst-version-switcher-list-3"
aria-label="Version switcher list"
>
Choose version <!-- this text may get changed later by javascript -->
<span class="caret"></span>
</button>
<div id="pst-version-switcher-list-3"
class="version-switcher__menu dropdown-menu list-group-flush py-0"
role="listbox" aria-labelledby="pst-version-switcher-button-3">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div></div>
</div>
<div class="sidebar-header-items__end">
<div class="navbar-item">
<button class="btn btn-sm nav-link pst-navbar-icon theme-switch-button pst-js-only" aria-label="Color mode" data-bs-title="Color mode" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="theme-switch fa-solid fa-sun fa-lg" data-mode="light" title="Light"></i>
<i class="theme-switch fa-solid fa-moon fa-lg" data-mode="dark" title="Dark"></i>
<i class="theme-switch fa-solid fa-circle-half-stroke fa-lg" data-mode="auto" title="System Settings"></i>
</button></div>
</div>
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item">
<nav class="bd-docs-nav bd-links"
aria-label="Table of Contents">
<p class="bd-links__title" role="heading" aria-level="1">Table of Contents</p>
<div class="bd-toc-item navbar-nav"><p aria-level="2" class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../overview.html">Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../quick-start-guide.html">Quick Start Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="../key-features.html">Key Features</a></li>
<li class="toctree-l1"><a class="reference internal" href="../torch.html">PyTorch Backend</a></li>
<li class="toctree-l1"><a class="reference internal" href="../release-notes.html">Release Notes</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Installation</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../installation/linux.html">Installing on Linux</a></li>
<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-linux.html">Building from Source Code on Linux</a></li>
<li class="toctree-l1"><a class="reference internal" href="../installation/grace-hopper.html">Installing on Grace Hopper</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">LLM API</span></p>
<ul class="current nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="index.html">API Introduction</a></li>
<li class="toctree-l1 current active"><a class="current reference internal" href="#">API Reference</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Examples</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1 has-children"><a class="reference internal" href="../examples/index.html">LLM Examples Introduction</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_serve.html">Llm Mgmn Trtllm Serve</a></li>
</ul>
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../examples/customization.html">LLM Common Customizations</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../examples/llm_api_examples.html">LLM Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_eagle_decoding.html">Generate Text Using Eagle Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_logits_processor.html">Control generated text using logits processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_kv_events.html">Get KV Cache Events</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_inference_customize.html">Generate text with customization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_llm_distributed.html">Llm Mgmn Llm Distributed</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_bench.html">Llm Mgmn Trtllm Bench</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/llm_mgmn_trtllm_serve.html">Llm Mgmn Trtllm Serve</a></li>
</ul>
</details></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../examples/trtllm_serve_examples.html">Online Serving Examples</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../examples/curl_chat_client.html">Curl Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/curl_chat_client_for_multimodal.html">Curl Chat Client For Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/curl_completion_client.html">Curl Completion Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/deepseek_r1_reasoning_parser.html">Deepseek R1 Reasoning Parser</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/genai_perf_client.html">Genai Perf Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/genai_perf_client_for_multimodal.html">Genai Perf Client For Multimodal</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/openai_chat_client.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/openai_chat_client_for_multimodal.html">OpenAI Chat Client</a></li>
<li class="toctree-l2"><a class="reference internal" href="../examples/openai_completion_client.html">OpenAI Completion Client</a></li>
</ul>
</details></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Model Definition API</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.layers.html">Layers</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.functional.html">Functionals</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.plugin.html">Plugin</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html">Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.runtime.html">Runtime</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">C++ API</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/executor.html">Executor</a></li>
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/runtime.html">Runtime</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Architecture</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../architecture/overview.html">TensorRT-LLM Architecture</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html">Model Definition</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/checkpoint.html">TensorRT-LLM Checkpoint</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/workflow.html">TensorRT-LLM Build Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/add-model.html">Adding a Model</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Advanced</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-runtime.html">C++ GPT Runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/executor.html">Executor API</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using Executor / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-overview.html">Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-benchmarking.html">Benchmarking</a></li>
<li class="toctree-l1 has-children"><a class="reference internal" href="../performance/performance-tuning-guide/index.html">Performance Tuning Guide</a><details><summary><span class="toctree-toggle" role="presentation"><i class="fa-solid fa-chevron-down"></i></span></summary><ul>
<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/benchmarking-default-performance.html">Benchmarking Default Performance</a></li>
<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/useful-build-time-flags.html">Useful Build-Time Flags</a></li>
<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/tuning-max-batch-size-and-max-num-tokens.html">Tuning Max Batch Size and Max Num Tokens</a></li>
<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/deciding-model-sharding-strategy.html">Deciding Model Sharding Strategy</a></li>
<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/fp8-quantization.html">FP8 Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="../performance/performance-tuning-guide/useful-runtime-flags.html">Useful Runtime Options</a></li>
</ul>
</details></li>
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-analysis.html">Performance Analysis</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Reference</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../reference/troubleshooting.html">Troubleshooting</a></li>
<li class="toctree-l1"><a class="reference internal" href="../reference/support-matrix.html">Support Matrix</a></li>
<li class="toctree-l1"><a class="reference internal" href="../reference/precision.html">Numerical Precision</a></li>
<li class="toctree-l1"><a class="reference internal" href="../reference/memory.html">Memory Usage of TensorRT-LLM</a></li>
</ul>
<p aria-level="2" class="caption" role="heading"><span class="caption-text">Blogs</span></p>
<ul class="nav bd-sidenav">
<li class="toctree-l1"><a class="reference internal" href="../blogs/H100vsA100.html">H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/H200launch.html">H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.html">Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
</div>
<main id="main-content" class="bd-main" role="main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article d-print-none">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item">
<nav aria-label="Breadcrumb" class="d-print-none">
<ul class="bd-breadcrumbs">
<li class="breadcrumb-item breadcrumb-home">
<a href="../index.html" class="nav-link" aria-label="Home">
<i class="fa-solid fa-home"></i>
</a>
</li>
<li class="breadcrumb-item active" aria-current="page"><span class="ellipsis">API Reference</span></li>
</ul>
</nav>
</div>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article">
<section id="api-reference">
<h1>API Reference<a class="headerlink" href="#api-reference" title="Link to this heading">#</a></h1>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">LLM</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">model</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Path</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">tokenizer</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Path</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">PreTrainedTokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">tokenizer_mode</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Literal</span><span class="p"><span class="pre">[</span></span><span class="s"><span class="pre">'auto'</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="s"><span class="pre">'slow'</span></span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'auto'</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">skip_tokenizer_init</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">trust_remote_code</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">tensor_parallel_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">dtype</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'auto'</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">revision</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">tokenizer_revision</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Any</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#LLM"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LLM" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>LLM class is the main class for running a LLM model.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>model</strong> (<em>Union</em><em>[</em><em>str</em><em>, </em><em>pathlib.Path</em><em>]</em>) The path to the model checkpoint or the model name from the Hugging Face Hub.</p></li>
<li><p><strong>tokenizer</strong> (<em>Union</em><em>[</em><em>str</em><em>, </em><em>pathlib.Path</em><em>, </em><em>transformers.tokenization_utils_base.PreTrainedTokenizerBase</em><em>, </em><em>tensorrt_llm.llmapi.tokenizer.TokenizerBase</em><em>, </em><em>NoneType</em><em>]</em>) The path to the tokenizer checkpoint or the tokenizer name from the Hugging Face Hub. Defaults to None.</p></li>
<li><p><strong>tokenizer_mode</strong> (<em>Literal</em><em>[</em><em>'auto'</em><em>, </em><em>'slow'</em><em>]</em>) The mode to initialize the tokenizer. Defaults to auto.</p></li>
<li><p><strong>skip_tokenizer_init</strong> (<em>bool</em>) Whether to skip the tokenizer initialization. Defaults to False.</p></li>
<li><p><strong>trust_remote_code</strong> (<em>bool</em>) Whether to trust the remote code. Defaults to False.</p></li>
<li><p><strong>tensor_parallel_size</strong> (<em>int</em>) The tensor parallel size. Defaults to 1.</p></li>
<li><p><strong>dtype</strong> (<em>str</em>) The data type to use for the model. Defaults to auto.</p></li>
<li><p><strong>revision</strong> (<em>Optional</em><em>[</em><em>str</em><em>]</em>) The revision to use for the model. Defaults to None.</p></li>
<li><p><strong>tokenizer_revision</strong> (<em>Optional</em><em>[</em><em>str</em><em>]</em>) The revision to use for the tokenizer. Defaults to None.</p></li>
<li><p><strong>pipeline_parallel_size</strong> (<em>int</em>) The pipeline parallel size. Defaults to 1.</p></li>
<li><p><strong>context_parallel_size</strong> (<em>int</em>) The context parallel size. Defaults to 1.</p></li>
<li><p><strong>gpus_per_node</strong> (<em>Optional</em><em>[</em><em>int</em><em>]</em>) The number of GPUs per node. Defaults to None.</p></li>
<li><p><strong>moe_cluster_parallel_size</strong> (<em>Optional</em><em>[</em><em>int</em><em>]</em>) The cluster parallel size for MoE modelss expert weights. Defaults to None.</p></li>
<li><p><strong>moe_tensor_parallel_size</strong> (<em>Optional</em><em>[</em><em>int</em><em>]</em>) The tensor parallel size for MoE modelss expert weights. Defaults to None.</p></li>
<li><p><strong>moe_expert_parallel_size</strong> (<em>Optional</em><em>[</em><em>int</em><em>]</em>) The expert parallel size for MoE modelss expert weights. Defaults to None.</p></li>
<li><p><strong>enable_attention_dp</strong> (<em>bool</em>) Enable attention data parallel. Defaults to False.</p></li>
<li><p><strong>cp_config</strong> (<em>Optional</em><em>[</em><em>dict</em><em>]</em>) Context parallel config. Defaults to None.</p></li>
<li><p><strong>auto_parallel</strong> (<em>bool</em>) Enable auto parallel mode. Defaults to False.</p></li>
<li><p><strong>auto_parallel_world_size</strong> (<em>Optional</em><em>[</em><em>int</em><em>]</em>) The world size for auto parallel mode. Defaults to None.</p></li>
<li><p><strong>load_format</strong> (<em>Literal</em><em>[</em><em>'auto'</em><em>, </em><em>'dummy'</em><em>]</em>) The format to load the model. Defaults to auto.</p></li>
<li><p><strong>enable_tqdm</strong> (<em>bool</em>) Enable tqdm for progress bar. Defaults to False.</p></li>
<li><p><strong>enable_lora</strong> (<em>bool</em>) Enable LoRA. Defaults to False.</p></li>
<li><p><strong>lora_config</strong> (<em>Optional</em><em>[</em><em>tensorrt_llm.lora_manager.LoraConfig</em><em>]</em>) LoRA configuration for the model. Defaults to None.</p></li>
<li><p><strong>enable_prompt_adapter</strong> (<em>bool</em>) Enable prompt adapter. Defaults to False.</p></li>
<li><p><strong>max_prompt_adapter_token</strong> (<em>int</em>) The maximum number of prompt adapter tokens. Defaults to 0.</p></li>
<li><p><strong>quant_config</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig" title="tensorrt_llm.models.modeling_utils.QuantConfig"><em>tensorrt_llm.models.modeling_utils.QuantConfig</em></a><em>]</em>) Quantization config. Defaults to None.</p></li>
<li><p><strong>calib_config</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.CalibConfig" title="tensorrt_llm.llmapi.llm_args.CalibConfig"><em>tensorrt_llm.llmapi.llm_args.CalibConfig</em></a><em>]</em>) Calibration config. Defaults to None.</p></li>
<li><p><strong>build_config</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.BuildConfig" title="tensorrt_llm.builder.BuildConfig"><em>tensorrt_llm.builder.BuildConfig</em></a><em>]</em>) Build config. Defaults to None.</p></li>
<li><p><strong>kv_cache_config</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheConfig" title="tensorrt_llm.llmapi.llm_args.KvCacheConfig"><em>tensorrt_llm.llmapi.llm_args.KvCacheConfig</em></a><em>]</em>) KV cache config. Defaults to None.</p></li>
<li><p><strong>enable_chunked_prefill</strong> (<em>bool</em>) Enable chunked prefill. Defaults to False.</p></li>
<li><p><strong>guided_decoding_backend</strong> (<em>Optional</em><em>[</em><em>str</em><em>]</em>) Guided decoding backend. Defaults to None.</p></li>
<li><p><strong>batched_logits_processor</strong> (<em>Optional</em><em>[</em><em>tensorrt_llm.sampling_params.BatchedLogitsProcessor</em><em>]</em>) Batched logits processor. Defaults to None.</p></li>
<li><p><strong>iter_stats_max_iterations</strong> (<em>Optional</em><em>[</em><em>int</em><em>]</em>) The maximum number of iterations for iter stats. Defaults to None.</p></li>
<li><p><strong>request_stats_max_iterations</strong> (<em>Optional</em><em>[</em><em>int</em><em>]</em>) The maximum number of iterations for request stats. Defaults to None.</p></li>
<li><p><strong>workspace</strong> (<em>Optional</em><em>[</em><em>str</em><em>]</em>) The workspace for the model. Defaults to None.</p></li>
<li><p><strong>embedding_parallel_mode</strong> (<em>str</em>) The embedding parallel mode. Defaults to SHARDING_ALONG_VOCAB.</p></li>
<li><p><strong>fast_build</strong> (<em>bool</em>) Enable fast build. Defaults to False.</p></li>
<li><p><strong>enable_build_cache</strong> (<em>Union</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.BuildCacheConfig" title="tensorrt_llm.llmapi.build_cache.BuildCacheConfig"><em>tensorrt_llm.llmapi.build_cache.BuildCacheConfig</em></a><em>, </em><em>bool</em><em>]</em>) Enable build cache. Defaults to False.</p></li>
<li><p><strong>peft_cache_config</strong> (<em>Optional</em><em>[</em><em>tensorrt_llm.llmapi.llm_args.PeftCacheConfig</em><em>]</em>) PEFT cache config. Defaults to None.</p></li>
<li><p><strong>scheduler_config</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.SchedulerConfig" title="tensorrt_llm.llmapi.llm_args.SchedulerConfig"><em>tensorrt_llm.llmapi.llm_args.SchedulerConfig</em></a><em>]</em>) Scheduler config. Defaults to None.</p></li>
<li><p><strong>cache_transceiver_config</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.CacheTransceiverConfig" title="tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig"><em>tensorrt_llm.llmapi.llm_args.CacheTransceiverConfig</em></a><em>]</em>) Cache transceiver config. Defaults to None.</p></li>
<li><p><strong>speculative_config</strong> (<em>Union</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig" title="tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig"><em>tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig</em></a><em>, </em><a class="reference internal" href="#tensorrt_llm.llmapi.MedusaDecodingConfig" title="tensorrt_llm.llmapi.llm_args.MedusaDecodingConfig"><em>tensorrt_llm.llmapi.llm_args.MedusaDecodingConfig</em></a><em>, </em><a class="reference internal" href="#tensorrt_llm.llmapi.EagleDecodingConfig" title="tensorrt_llm.llmapi.llm_args.EagleDecodingConfig"><em>tensorrt_llm.llmapi.llm_args.EagleDecodingConfig</em></a><em>, </em><a class="reference internal" href="#tensorrt_llm.llmapi.MTPDecodingConfig" title="tensorrt_llm.llmapi.llm_args.MTPDecodingConfig"><em>tensorrt_llm.llmapi.llm_args.MTPDecodingConfig</em></a><em>, </em><em>NoneType</em><em>]</em>) Speculative decoding config. Defaults to None.</p></li>
<li><p><strong>batching_type</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.BatchingType" title="tensorrt_llm.llmapi.llm_args.BatchingType"><em>tensorrt_llm.llmapi.llm_args.BatchingType</em></a><em>]</em>) Batching type. Defaults to None.</p></li>
<li><p><strong>normalize_log_probs</strong> (<em>bool</em>) Normalize log probabilities. Defaults to False.</p></li>
<li><p><strong>gather_generation_logits</strong> (<em>bool</em>) Gather generation logits. Defaults to False.</p></li>
<li><p><strong>extended_runtime_perf_knob_config</strong> (<em>Optional</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig" title="tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig"><em>tensorrt_llm.llmapi.llm_args.ExtendedRuntimePerfKnobConfig</em></a><em>]</em>) Extended runtime perf knob config. Defaults to None.</p></li>
<li><p><strong>max_batch_size</strong> (<em>Optional</em><em>[</em><em>int</em><em>]</em>) The maximum batch size. Defaults to None.</p></li>
<li><p><strong>max_input_len</strong> (<em>int</em>) The maximum input length. Defaults to 1024.</p></li>
<li><p><strong>max_seq_len</strong> (<em>Optional</em><em>[</em><em>int</em><em>]</em>) The maximum sequence length. Defaults to None.</p></li>
<li><p><strong>max_beam_width</strong> (<em>int</em>) The maximum beam width. Defaults to 1.</p></li>
<li><p><strong>max_num_tokens</strong> (<em>Optional</em><em>[</em><em>int</em><em>]</em>) The maximum number of tokens. Defaults to None.</p></li>
<li><p><strong>backend</strong> (<em>Optional</em><em>[</em><em>str</em><em>]</em>) The backend to use. Defaults to None.</p></li>
<li><p><strong>kwargs</strong> (<em>Any</em>) Advanced arguments passed to <cite>LlmArgs</cite>.</p></li>
</ul>
</dd>
</dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM.tokenizer">
<span class="sig-name descname"><span class="pre">tokenizer</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.LLM.tokenizer" title="Link to this definition">#</a></dt>
<dd><p>The tokenizer loaded by LLM instance, if any.</p>
<dl class="field-list simple">
<dt class="field-odd">Type<span class="colon">:</span></dt>
<dd class="field-odd"><p>tensorrt_llm.llmapi.tokenizer.TokenizerBase, optional</p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM.workspace">
<span class="sig-name descname"><span class="pre">workspace</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.LLM.workspace" title="Link to this definition">#</a></dt>
<dd><p>The directory to store intermediate files.</p>
<dl class="field-list simple">
<dt class="field-odd">Type<span class="colon">:</span></dt>
<dd class="field-odd"><p>pathlib.Path</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">model</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Path</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">tokenizer</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Path</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">PreTrainedTokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">tokenizer_mode</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Literal</span><span class="p"><span class="pre">[</span></span><span class="s"><span class="pre">'auto'</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="s"><span class="pre">'slow'</span></span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'auto'</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">skip_tokenizer_init</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">trust_remote_code</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">tensor_parallel_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">dtype</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'auto'</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">revision</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">tokenizer_revision</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Any</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#LLM.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LLM.__init__" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM.generate">
<span class="sig-name descname"><span class="pre">generate</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">inputs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TextPrompt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TokensPrompt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Sequence</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TextPrompt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TokensPrompt</span><span class="p"><span class="pre">]</span></span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sampling_params</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams" title="tensorrt_llm.sampling_params.SamplingParams"><span class="pre">SamplingParams</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams" title="tensorrt_llm.sampling_params.SamplingParams"><span class="pre">SamplingParams</span></a><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_tqdm</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_request</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">LoRARequest</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Sequence</span><span class="p"><span class="pre">[</span></span><span class="pre">LoRARequest</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_adapter_request</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">PromptAdapterRequest</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Sequence</span><span class="p"><span class="pre">[</span></span><span class="pre">PromptAdapterRequest</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">kv_cache_retention_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig" title="tensorrt_llm.bindings.executor.KvCacheRetentionConfig"><span class="pre">KvCacheRetentionConfig</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Sequence</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig" title="tensorrt_llm.bindings.executor.KvCacheRetentionConfig"><span class="pre">KvCacheRetentionConfig</span></a><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">disaggregated_params</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.DisaggregatedParams" title="tensorrt_llm.disaggregated_params.DisaggregatedParams"><span class="pre">DisaggregatedParams</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Sequence</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="#tensorrt_llm.llmapi.DisaggregatedParams" title="tensorrt_llm.disaggregated_params.DisaggregatedParams"><span class="pre">DisaggregatedParams</span></a><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#tensorrt_llm.llmapi.RequestOutput" title="tensorrt_llm.llmapi.llm.RequestOutput"><span class="pre">RequestOutput</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="#tensorrt_llm.llmapi.RequestOutput" title="tensorrt_llm.llmapi.llm.RequestOutput"><span class="pre">RequestOutput</span></a><span class="p"><span class="pre">]</span></span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#LLM.generate"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LLM.generate" title="Link to this definition">#</a></dt>
<dd><p>Generate output for the given prompts in the synchronous mode.
Synchronous generation accepts either single prompt or batched prompts.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>inputs</strong> (<em>tensorrt_llm.inputs.data.PromptInputs</em><em>, </em><em>Sequence</em><em>[</em><em>tensorrt_llm.inputs.data.PromptInputs</em><em>]</em>) The prompt text or token ids.
It can be single prompt or batched prompts.</p></li>
<li><p><strong>sampling_params</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams" title="tensorrt_llm.sampling_params.SamplingParams"><em>tensorrt_llm.sampling_params.SamplingParams</em></a><em>, </em><em>List</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams" title="tensorrt_llm.sampling_params.SamplingParams"><em>tensorrt_llm.sampling_params.SamplingParams</em></a><em>]</em><em>, </em><em>optional</em>) The sampling params for the generation. Defaults to None.
A default one will be used if not provided.</p></li>
<li><p><strong>use_tqdm</strong> (<em>bool</em>) Whether to use tqdm to display the progress bar. Defaults to True.</p></li>
<li><p><strong>lora_request</strong> (<em>tensorrt_llm.executor.request.LoRARequest</em><em>, </em><em>Sequence</em><em>[</em><em>tensorrt_llm.executor.request.LoRARequest</em><em>]</em><em>, </em><em>optional</em>) LoRA request to use for generation, if any. Defaults to None.</p></li>
<li><p><strong>prompt_adapter_request</strong> (<em>tensorrt_llm.executor.request.PromptAdapterRequest</em><em>, </em><em>Sequence</em><em>[</em><em>tensorrt_llm.executor.request.PromptAdapterRequest</em><em>]</em><em>, </em><em>optional</em>) Prompt Adapter request to use for generation, if any. Defaults to None.</p></li>
<li><p><strong>kv_cache_retention_config</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig" title="tensorrt_llm.bindings.executor.KvCacheRetentionConfig"><em>tensorrt_llm.bindings.executor.KvCacheRetentionConfig</em></a><em>, </em><em>Sequence</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig" title="tensorrt_llm.bindings.executor.KvCacheRetentionConfig"><em>tensorrt_llm.bindings.executor.KvCacheRetentionConfig</em></a><em>]</em><em>, </em><em>optional</em>) Configuration for the requests retention in the KV Cache. Defaults to None.</p></li>
<li><p><strong>disaggregated_params</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.DisaggregatedParams" title="tensorrt_llm.disaggregated_params.DisaggregatedParams"><em>tensorrt_llm.disaggregated_params.DisaggregatedParams</em></a><em>, </em><em>Sequence</em><em>[</em><a class="reference internal" href="#tensorrt_llm.llmapi.DisaggregatedParams" title="tensorrt_llm.disaggregated_params.DisaggregatedParams"><em>tensorrt_llm.disaggregated_params.DisaggregatedParams</em></a><em>]</em><em>, </em><em>optional</em>) Disaggregated parameters. Defaults to None.</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>The output data of the completion request to the LLM.</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p>Union[<a class="reference internal" href="#tensorrt_llm.llmapi.RequestOutput" title="tensorrt_llm.llmapi.RequestOutput">tensorrt_llm.llmapi.RequestOutput</a>, List[<a class="reference internal" href="#tensorrt_llm.llmapi.RequestOutput" title="tensorrt_llm.llmapi.RequestOutput">tensorrt_llm.llmapi.RequestOutput</a>]]</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM.generate_async">
<span class="sig-name descname"><span class="pre">generate_async</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">inputs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TextPrompt</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">TokensPrompt</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sampling_params</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams" title="tensorrt_llm.sampling_params.SamplingParams"><span class="pre">SamplingParams</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_request</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">LoRARequest</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_adapter_request</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">PromptAdapterRequest</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">streaming</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">kv_cache_retention_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig" title="tensorrt_llm.bindings.executor.KvCacheRetentionConfig"><span class="pre">KvCacheRetentionConfig</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">disaggregated_params</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.DisaggregatedParams" title="tensorrt_llm.disaggregated_params.DisaggregatedParams"><span class="pre">DisaggregatedParams</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">_postproc_params</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">PostprocParams</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#tensorrt_llm.llmapi.RequestOutput" title="tensorrt_llm.llmapi.llm.RequestOutput"><span class="pre">RequestOutput</span></a></span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#LLM.generate_async"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LLM.generate_async" title="Link to this definition">#</a></dt>
<dd><p>Generate output for the given prompt in the asynchronous mode.
Asynchronous generation accepts single prompt only.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>inputs</strong> (<em>tensorrt_llm.inputs.data.PromptInputs</em>) The prompt text or token ids; it must be single prompt.</p></li>
<li><p><strong>sampling_params</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.SamplingParams" title="tensorrt_llm.sampling_params.SamplingParams"><em>tensorrt_llm.sampling_params.SamplingParams</em></a><em>, </em><em>optional</em>) The sampling params for the generation. Defaults to None.
A default one will be used if not provided.</p></li>
<li><p><strong>lora_request</strong> (<em>tensorrt_llm.executor.request.LoRARequest</em><em>, </em><em>optional</em>) LoRA request to use for generation, if any. Defaults to None.</p></li>
<li><p><strong>prompt_adapter_request</strong> (<em>tensorrt_llm.executor.request.PromptAdapterRequest</em><em>, </em><em>optional</em>) Prompt Adapter request to use for generation, if any. Defaults to None.</p></li>
<li><p><strong>streaming</strong> (<em>bool</em>) Whether to use the streaming mode for the generation. Defaults to False.</p></li>
<li><p><strong>kv_cache_retention_config</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig" title="tensorrt_llm.bindings.executor.KvCacheRetentionConfig"><em>tensorrt_llm.bindings.executor.KvCacheRetentionConfig</em></a><em>, </em><em>optional</em>) Configuration for the requests retention in the KV Cache. Defaults to None.</p></li>
<li><p><strong>disaggregated_params</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.DisaggregatedParams" title="tensorrt_llm.disaggregated_params.DisaggregatedParams"><em>tensorrt_llm.disaggregated_params.DisaggregatedParams</em></a><em>, </em><em>optional</em>) Disaggregated parameters. Defaults to None.</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>The output data of the completion request to the LLM.</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference internal" href="#tensorrt_llm.llmapi.RequestOutput" title="tensorrt_llm.llmapi.RequestOutput">tensorrt_llm.llmapi.RequestOutput</a></p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM.get_kv_cache_events">
<span class="sig-name descname"><span class="pre">get_kv_cache_events</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">timeout</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">2</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">]</span></span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#LLM.get_kv_cache_events"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LLM.get_kv_cache_events" title="Link to this definition">#</a></dt>
<dd><p>Get iteration KV events from the runtime.</p>
<dl class="simple">
<dt>KV events are used to track changes and operations within the KV Cache. Types of events:</dt><dd><ul class="simple">
<li><p>KVCacheCreatedData: Indicates the creation of cache blocks.</p></li>
<li><p>KVCacheStoredData: Represents a sequence of stored blocks.</p></li>
<li><p>KVCacheRemovedData: Contains the hashes of blocks that are being removed from the cache.</p></li>
<li><p>KVCacheUpdatedData: Captures updates to existing cache blocks.</p></li>
</ul>
</dd>
<dt>To enable KV events:</dt><dd><ul class="simple">
<li><p>set <cite>event_buffer_max_size</cite> to a positive integer in the <cite>KvCacheConfig</cite>.</p></li>
<li><p>set <cite>enable_block_reuse</cite> to True in the <cite>KvCacheConfig</cite>.</p></li>
</ul>
</dd>
</dl>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>timeout</strong> (<em>float</em><em>, </em><em>optional</em>) Max wait time in seconds when retrieving events from queue. Defaults to 2.</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>A list of runtime events as dict.</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p>List[dict]</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM.get_kv_cache_events_async">
<span class="sig-name descname"><span class="pre">get_kv_cache_events_async</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">timeout</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">2</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">IterationResult</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#LLM.get_kv_cache_events_async"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LLM.get_kv_cache_events_async" title="Link to this definition">#</a></dt>
<dd><p>Get iteration KV events from the runtime.</p>
<dl class="simple">
<dt>KV events are used to track changes and operations within the KV Cache. Types of events:</dt><dd><ul class="simple">
<li><p>KVCacheCreatedData: Indicates the creation of cache blocks.</p></li>
<li><p>KVCacheStoredData: Represents a sequence of stored blocks.</p></li>
<li><p>KVCacheRemovedData: Contains the hashes of blocks that are being removed from the cache.</p></li>
<li><p>KVCacheUpdatedData: Captures updates to existing cache blocks.</p></li>
</ul>
</dd>
<dt>To enable KV events:</dt><dd><ul class="simple">
<li><p>set <cite>event_buffer_max_size</cite> to a positive integer in the <cite>KvCacheConfig</cite>.</p></li>
<li><p>set <cite>enable_block_reuse</cite> to True in the <cite>KvCacheConfig</cite>.</p></li>
</ul>
</dd>
</dl>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>timeout</strong> (<em>float</em><em>, </em><em>optional</em>) Max wait time in seconds when retrieving events from queue. . Defaults to 2.</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>An async iterable object containing runtime events.</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p>tensorrt_llm.executor.result.IterationResult</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM.get_stats">
<span class="sig-name descname"><span class="pre">get_stats</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">timeout</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">2</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">]</span></span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#LLM.get_stats"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LLM.get_stats" title="Link to this definition">#</a></dt>
<dd><p>Get iteration statistics from the runtime.
To collect statistics, call this function after prompts have been submitted with LLM().generate().</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>timeout</strong> (<em>float</em><em>, </em><em>optional</em>) Max wait time in seconds when retrieving stats from queue. Defaults to 2.</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p><dl class="simple">
<dt>A list of runtime stats as dict.</dt><dd><p>e.g., [{“cpuMemUsage”: …, “iter”: 0, …}, {“cpuMemUsage”: …, “iter”: 1, …}]</p>
</dd>
</dl>
</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p>List[dict]</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM.get_stats_async">
<span class="sig-name descname"><span class="pre">get_stats_async</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">timeout</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">2</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">IterationResult</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#LLM.get_stats_async"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LLM.get_stats_async" title="Link to this definition">#</a></dt>
<dd><p>Get iteration statistics from the runtime.
To collect statistics, you can call this function in an async coroutine or the /metrics endpoint (if youre using trtllm-serve)
after prompts have been submitted.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>timeout</strong> (<em>float</em><em>, </em><em>optional</em>) Max wait time in seconds when retrieving stats from queue. Defaults to 2.</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>An async iterable object containing runtime stats.</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p>tensorrt_llm.executor.result.IterationResult</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM.save">
<span class="sig-name descname"><span class="pre">save</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">engine_dir</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#LLM.save"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LLM.save" title="Link to this definition">#</a></dt>
<dd><p>Save the built engine to the given path.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>engine_dir</strong> (<em>str</em>) The path to save the engine.</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LLM.shutdown">
<span class="sig-name descname"><span class="pre">shutdown</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#LLM.shutdown"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LLM.shutdown" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="id0">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">tokenizer</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">TokenizerBase</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#id0" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="id1">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">workspace</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Path</span></em><a class="headerlink" href="#id1" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CompletionOutput">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">CompletionOutput</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">index:</span> <span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">text:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">''</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">token_ids:</span> <span class="pre">~typing.List[int]</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cumulative_logprob:</span> <span class="pre">float</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">logprobs:</span> <span class="pre">list[dict[int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">~tensorrt_llm.executor.result.Logprob]]</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_logprobs:</span> <span class="pre">list[dict[int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">~tensorrt_llm.executor.result.Logprob]]</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">finish_reason:</span> <span class="pre">~typing.Literal['stop'</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">'length'</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">'timeout'</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">'cancelled']</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stop_reason:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">generation_logits:</span> <span class="pre">~torch.Tensor</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">disaggregated_params:</span> <span class="pre">~tensorrt_llm.disaggregated_params.DisaggregatedParams</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">_postprocess_result:</span> <span class="pre">~typing.Any</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/executor/result.html#CompletionOutput"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.CompletionOutput" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>The output data of one completion output of a request.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>index</strong> (<em>int</em>) The index of the output in the request.</p></li>
<li><p><strong>text</strong> (<em>str</em>) The generated output text. Defaults to “”.</p></li>
<li><p><strong>token_ids</strong> (<em>List</em><em>[</em><em>int</em><em>]</em><em>, </em><em>optional</em>) The token ids of the generated output text. Defaults to [].</p></li>
<li><p><strong>cumulative_logprob</strong> (<em>float</em><em>, </em><em>optional</em>) The cumulative log probability of the generated output text. Defaults to None.</p></li>
<li><p><strong>logprobs</strong> (<em>TokenLogprobs</em><em>, </em><em>optional</em>) The log probabilities of the top probability words at each position if the logprobs are requested. Defaults to None.</p></li>
<li><p><strong>prompt_logprobs</strong> (<em>TokenLogprobs</em><em>, </em><em>optional</em>) The log probabilities per prompt token. Defaults to None.</p></li>
<li><p><strong>finish_reason</strong> (<em>Literal</em><em>[</em><em>'stop'</em><em>, </em><em>'length'</em><em>, </em><em>'timeout'</em><em>, </em><em>'cancelled'</em><em>]</em><em>, </em><em>optional</em>) The reason why the sequence is finished. Defaults to None.</p></li>
<li><p><strong>stop_reason</strong> (<em>int</em><em>, </em><em>str</em><em>, </em><em>optional</em>) The stop string or token id that caused the completion to stop, None if the completion finished for some other reason. Defaults to None.</p></li>
<li><p><strong>generation_logits</strong> (<em>torch.Tensor</em><em>, </em><em>optional</em>) The logits on the generated output token ids. Defaults to None.</p></li>
<li><p><strong>disaggregated_params</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.DisaggregatedParams" title="tensorrt_llm.disaggregated_params.DisaggregatedParams"><em>tensorrt_llm.disaggregated_params.DisaggregatedParams</em></a><em>, </em><em>optional</em>) Parameters needed for disaggregated serving. Includes the type of request, the first generated tokens, the context request id and the any additional state needing to be transferred from context and generation instances. Defaults to None.</p></li>
</ul>
</dd>
</dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CompletionOutput.length">
<span class="sig-name descname"><span class="pre">length</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.CompletionOutput.length" title="Link to this definition">#</a></dt>
<dd><p>The number of generated tokens.</p>
<dl class="field-list simple">
<dt class="field-odd">Type<span class="colon">:</span></dt>
<dd class="field-odd"><p>int</p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CompletionOutput.token_ids_diff">
<span class="sig-name descname"><span class="pre">token_ids_diff</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.CompletionOutput.token_ids_diff" title="Link to this definition">#</a></dt>
<dd><p>Newly generated token ids.</p>
<dl class="field-list simple">
<dt class="field-odd">Type<span class="colon">:</span></dt>
<dd class="field-odd"><p>List[int]</p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CompletionOutput.logprobs_diff">
<span class="sig-name descname"><span class="pre">logprobs_diff</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.CompletionOutput.logprobs_diff" title="Link to this definition">#</a></dt>
<dd><p>Logprobs of newly generated tokens.</p>
<dl class="field-list simple">
<dt class="field-odd">Type<span class="colon">:</span></dt>
<dd class="field-odd"><p>List[float]</p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CompletionOutput.text_diff">
<span class="sig-name descname"><span class="pre">text_diff</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.CompletionOutput.text_diff" title="Link to this definition">#</a></dt>
<dd><p>Newly generated tokens.</p>
<dl class="field-list simple">
<dt class="field-odd">Type<span class="colon">:</span></dt>
<dd class="field-odd"><p>str</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CompletionOutput.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">index:</span> <span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">text:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">''</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">token_ids:</span> <span class="pre">~typing.List[int]</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cumulative_logprob:</span> <span class="pre">float</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">logprobs:</span> <span class="pre">list[dict[int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">~tensorrt_llm.executor.result.Logprob]]</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_logprobs:</span> <span class="pre">list[dict[int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">~tensorrt_llm.executor.result.Logprob]]</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">finish_reason:</span> <span class="pre">~typing.Literal['stop'</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">'length'</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">'timeout'</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">'cancelled']</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stop_reason:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">str</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">generation_logits:</span> <span class="pre">~torch.Tensor</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">disaggregated_params:</span> <span class="pre">~tensorrt_llm.disaggregated_params.DisaggregatedParams</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">_postprocess_result:</span> <span class="pre">~typing.Any</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.CompletionOutput.__init__" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CompletionOutput.cumulative_logprob">
<span class="sig-name descname"><span class="pre">cumulative_logprob</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CompletionOutput.cumulative_logprob" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CompletionOutput.disaggregated_params">
<span class="sig-name descname"><span class="pre">disaggregated_params</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="#tensorrt_llm.llmapi.DisaggregatedParams" title="tensorrt_llm.disaggregated_params.DisaggregatedParams"><span class="pre">DisaggregatedParams</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CompletionOutput.disaggregated_params" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CompletionOutput.finish_reason">
<span class="sig-name descname"><span class="pre">finish_reason</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Literal</span><span class="p"><span class="pre">[</span></span><span class="s"><span class="pre">'stop'</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="s"><span class="pre">'length'</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="s"><span class="pre">'timeout'</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="s"><span class="pre">'cancelled'</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CompletionOutput.finish_reason" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CompletionOutput.generation_logits">
<span class="sig-name descname"><span class="pre">generation_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CompletionOutput.generation_logits" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CompletionOutput.index">
<span class="sig-name descname"><span class="pre">index</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CompletionOutput.index" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="id2">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">length</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#id2" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CompletionOutput.logprobs">
<span class="sig-name descname"><span class="pre">logprobs</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Logprob</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CompletionOutput.logprobs" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="id3">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">logprobs_diff</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">float</span><span class="p"><span class="pre">]</span></span></em><a class="headerlink" href="#id3" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CompletionOutput.prompt_logprobs">
<span class="sig-name descname"><span class="pre">prompt_logprobs</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">list</span><span class="p"><span class="pre">[</span></span><span class="pre">dict</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">Logprob</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CompletionOutput.prompt_logprobs" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CompletionOutput.stop_reason">
<span class="sig-name descname"><span class="pre">stop_reason</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CompletionOutput.stop_reason" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CompletionOutput.text">
<span class="sig-name descname"><span class="pre">text</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CompletionOutput.text" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="id4">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">text_diff</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><a class="headerlink" href="#id4" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CompletionOutput.token_ids">
<span class="sig-name descname"><span class="pre">token_ids</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CompletionOutput.token_ids" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="id5">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">token_ids_diff</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span></em><a class="headerlink" href="#id5" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.RequestOutput">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">RequestOutput</span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#RequestOutput"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.RequestOutput" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">DetokenizedGenerationResultBase</span></code>, <code class="xref py py-class docutils literal notranslate"><span class="pre">GenerationResult</span></code></p>
<p>The output data of a completion request to the LLM.</p>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.RequestOutput.request_id">
<span class="sig-name descname"><span class="pre">request_id</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.RequestOutput.request_id" title="Link to this definition">#</a></dt>
<dd><p>The unique ID of the request.</p>
<dl class="field-list simple">
<dt class="field-odd">Type<span class="colon">:</span></dt>
<dd class="field-odd"><p>int</p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.RequestOutput.prompt">
<span class="sig-name descname"><span class="pre">prompt</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.RequestOutput.prompt" title="Link to this definition">#</a></dt>
<dd><p>The prompt string of the request.</p>
<dl class="field-list simple">
<dt class="field-odd">Type<span class="colon">:</span></dt>
<dd class="field-odd"><p>str, optional</p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.RequestOutput.prompt_token_ids">
<span class="sig-name descname"><span class="pre">prompt_token_ids</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.RequestOutput.prompt_token_ids" title="Link to this definition">#</a></dt>
<dd><p>The token ids of the prompt.</p>
<dl class="field-list simple">
<dt class="field-odd">Type<span class="colon">:</span></dt>
<dd class="field-odd"><p>List[int]</p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.RequestOutput.outputs">
<span class="sig-name descname"><span class="pre">outputs</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.RequestOutput.outputs" title="Link to this definition">#</a></dt>
<dd><p>The output sequences of the request.</p>
<dl class="field-list simple">
<dt class="field-odd">Type<span class="colon">:</span></dt>
<dd class="field-odd"><p>List[<a class="reference internal" href="#tensorrt_llm.llmapi.CompletionOutput" title="tensorrt_llm.llmapi.CompletionOutput">CompletionOutput</a>]</p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.RequestOutput.context_logits">
<span class="sig-name descname"><span class="pre">context_logits</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.RequestOutput.context_logits" title="Link to this definition">#</a></dt>
<dd><p>The logits on the prompt token ids.</p>
<dl class="field-list simple">
<dt class="field-odd">Type<span class="colon">:</span></dt>
<dd class="field-odd"><p>torch.Tensor, optional</p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.RequestOutput.finished">
<span class="sig-name descname"><span class="pre">finished</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.RequestOutput.finished" title="Link to this definition">#</a></dt>
<dd><p>Whether the whole request is finished.</p>
<dl class="field-list simple">
<dt class="field-odd">Type<span class="colon">:</span></dt>
<dd class="field-odd"><p>bool</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.RequestOutput.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm.html#RequestOutput.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.RequestOutput.__init__" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="id6">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">prompt</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#id6" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.GuidedDecodingParams">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">GuidedDecodingParams</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">json</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">BaseModel</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">dict</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">regex</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">grammar</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">json_object</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">structural_tag</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/sampling_params.html#GuidedDecodingParams"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.GuidedDecodingParams" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>Guided decoding parameters for text generation. Only one of the fields could be effective.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>json</strong> (<em>str</em><em>, </em><em>pydantic.main.BaseModel</em><em>, </em><em>dict</em><em>, </em><em>optional</em>) The generated text is amenable to json format with additional user-specified restrictions, namely schema. Defaults to None.</p></li>
<li><p><strong>regex</strong> (<em>str</em><em>, </em><em>optional</em>) The generated text is amenable to the user-specified regular expression. Defaults to None.</p></li>
<li><p><strong>grammar</strong> (<em>str</em><em>, </em><em>optional</em>) The generated text is amenable to the user-specified extended Backus-Naur form (EBNF) grammar. Defaults to None.</p></li>
<li><p><strong>json_object</strong> (<em>bool</em>) If True, the generated text is amenable to json format. Defaults to False.</p></li>
<li><p><strong>structural_tag</strong> (<em>str</em><em>, </em><em>optional</em>) The generated text is amenable to the user-specified structural tag. Defaults to None.</p></li>
</ul>
</dd>
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.GuidedDecodingParams.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">json</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">BaseModel</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">dict</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">regex</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">grammar</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">json_object</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">structural_tag</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.GuidedDecodingParams.__init__" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.GuidedDecodingParams.grammar">
<span class="sig-name descname"><span class="pre">grammar</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.GuidedDecodingParams.grammar" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.GuidedDecodingParams.json">
<span class="sig-name descname"><span class="pre">json</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">BaseModel</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">dict</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.GuidedDecodingParams.json" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.GuidedDecodingParams.json_object">
<span class="sig-name descname"><span class="pre">json_object</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.GuidedDecodingParams.json_object" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.GuidedDecodingParams.regex">
<span class="sig-name descname"><span class="pre">regex</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.GuidedDecodingParams.regex" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.GuidedDecodingParams.structural_tag">
<span class="sig-name descname"><span class="pre">structural_tag</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.GuidedDecodingParams.structural_tag" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">SamplingParams</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">end_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">pad_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">32</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">bad</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">bad_token_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stop</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stop_token_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">include_stop_str_in_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">embedding_bias</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">logits_processor</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">LogitsProcessor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">LogitsProcessor</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">apply_batched_logits_processor</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">n</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">best_of</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_beam_search</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">top_k</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">top_p</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">top_p_min</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">top_p_reset_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">top_p_decay</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">seed</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">temperature</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">min_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">beam_search_diversity_rate</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">repetition_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">presence_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">frequency_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">length_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">early_stopping</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">no_repeat_ngram_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">min_p</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">beam_width_array</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">logprobs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_logprobs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">return_context_logits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">return_generation_logits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">exclude_input_from_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">return_encoder_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">return_perf_metrics</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">additional_model_outputs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">AdditionalModelOutput</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">_context_logits_auto_enabled</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">_generation_logits_auto_enabled</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">_return_log_probs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lookahead_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">LookaheadDecodingConfig</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">guided_decoding</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.GuidedDecodingParams" title="tensorrt_llm.sampling_params.GuidedDecodingParams"><span class="pre">GuidedDecodingParams</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">ignore_eos</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">detokenize</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">add_special_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">truncate_prompt_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">skip_special_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">spaces_between_special_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/sampling_params.html#SamplingParams"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>Sampling parameters for text generation.</p>
<p>Usage Examples:</p>
<blockquote>
<div><dl class="simple">
<dt>use_beam_search is False:</dt><dd><ul class="simple">
<li><p>best_of is None: (top-p/top-k) sampling n responses and return n generations</p></li>
<li><p>best_of is not None: (top-p/top-k) sampling best_of responses and return n generations (best_of &gt;= n must hold)</p></li>
</ul>
</dd>
<dt>use_beam_search is True:</dt><dd><ul class="simple">
<li><p>best_of is None: beam search with beam width of n, return n generations</p></li>
<li><p>best_of is not None: beam search with beam width of best_of, return n generations (best_of &gt;= n must hold)</p></li>
</ul>
</dd>
</dl>
</div></blockquote>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>end_id</strong> (<em>int</em><em>, </em><em>optional</em>) The end token id. Defaults to None.</p></li>
<li><p><strong>pad_id</strong> (<em>int</em><em>, </em><em>optional</em>) The pad token id. Defaults to None.</p></li>
<li><p><strong>max_tokens</strong> (<em>int</em>) The maximum number of tokens to generate. Defaults to 32.</p></li>
<li><p><strong>bad</strong> (<em>str</em><em>, </em><em>List</em><em>[</em><em>str</em><em>]</em><em>, </em><em>optional</em>) A string or a list of strings that redirect the generation when they are generated, so that the bad strings are excluded from the returned output. Defaults to None.</p></li>
<li><p><strong>bad_token_ids</strong> (<em>List</em><em>[</em><em>int</em><em>]</em><em>, </em><em>optional</em>) A list of token ids that redirect the generation when they are generated, so that the bad ids are excluded from the returned output. Defaults to None.</p></li>
<li><p><strong>stop</strong> (<em>str</em><em>, </em><em>List</em><em>[</em><em>str</em><em>]</em><em>, </em><em>optional</em>) A string or a list of strings that stop the generation when they are generated. The returned output will not contain the stop strings unless include_stop_str_in_output is True. Defaults to None.</p></li>
<li><p><strong>stop_token_ids</strong> (<em>List</em><em>[</em><em>int</em><em>]</em><em>, </em><em>optional</em>) A list of token ids that stop the generation when they are generated. Defaults to None.</p></li>
<li><p><strong>include_stop_str_in_output</strong> (<em>bool</em>) Whether to include the stop strings in output text. Defaults to False.</p></li>
<li><p><strong>embedding_bias</strong> (<em>torch.Tensor</em><em>, </em><em>optional</em>) The embedding bias tensor. Expected type is kFP32 and shape is [vocab_size]. Defaults to None.</p></li>
<li><p><strong>logits_processor</strong> (<em>tensorrt_llm.sampling_params.LogitsProcessor</em><em>, </em><em>List</em><em>[</em><em>tensorrt_llm.sampling_params.LogitsProcessor</em><em>]</em><em>, </em><em>optional</em>) The logits postprocessor callback(s). Defaults to None.
If a list, each processor is applied in order during generation (supported in PyTorch backend only).</p></li>
<li><p><strong>apply_batched_logits_processor</strong> (<em>bool</em>) Whether to apply batched logits postprocessor callback. Defaults to False.
The BatchedLogitsProcessor class is recommended for callback creation. The callback must be provided when initializing LLM.</p></li>
<li><p><strong>n</strong> (<em>int</em>) Number of sequences to generate. Defaults to 1.</p></li>
<li><p><strong>best_of</strong> (<em>int</em><em>, </em><em>optional</em>) Number of sequences to consider for best output. Defaults to None.</p></li>
<li><p><strong>use_beam_search</strong> (<em>bool</em>) Whether to use beam search. Defaults to False.</p></li>
<li><p><strong>top_k</strong> (<em>int</em><em>, </em><em>optional</em>) Controls number of logits to sample from. None means using C++ runtime default 0, i.e., all logits. Defaults to None.</p></li>
<li><p><strong>top_p</strong> (<em>float</em><em>, </em><em>optional</em>) Controls the top-P probability to sample from. None means using C++ runtime default 0.f. Defaults to None.</p></li>
<li><p><strong>top_p_min</strong> (<em>float</em><em>, </em><em>optional</em>) Controls decay in the top-P algorithm. topPMin is lower-bound. None means using C++ runtime default 1.e-6. Defaults to None.</p></li>
<li><p><strong>top_p_reset_ids</strong> (<em>int</em><em>, </em><em>optional</em>) Controls decay in the top-P algorithm. Indicates where to reset the decay. None means using C++ runtime default 1. Defaults to None.</p></li>
<li><p><strong>top_p_decay</strong> (<em>float</em><em>, </em><em>optional</em>) Controls decay in the top-P algorithm. The decay value. None means using C++ runtime default 1.f. Defaults to None.</p></li>
<li><p><strong>seed</strong> (<em>int</em><em>, </em><em>optional</em>) Controls the random seed used by the random number generator in sampling. None means using C++ runtime default 0. Defaults to None.</p></li>
<li><p><strong>temperature</strong> (<em>float</em><em>, </em><em>optional</em>) Controls the modulation of logits when sampling new tokens. It can have values &gt; 0.f. None means using C++ runtime default 1.0f. Defaults to None.</p></li>
<li><p><strong>min_tokens</strong> (<em>int</em><em>, </em><em>optional</em>) Lower bound on the number of tokens to generate. Values &lt; 1 have no effect. None means using C++ runtime default 1. Defaults to None.</p></li>
<li><p><strong>beam_search_diversity_rate</strong> (<em>float</em><em>, </em><em>optional</em>) Used to penalize tokens based on how often they appear in the sequence. It can have any value &gt; 0.f. Values &lt; 1.f encourages repetition, values &gt; 1.f discourages it. None means using C++ runtime default 1.f. Defaults to None.</p></li>
<li><p><strong>repetition_penalty</strong> (<em>float</em><em>, </em><em>optional</em>) Used to penalize tokens based on how often they appear in the sequence. It can have any value &gt; 0.f. Values &lt; 1.f encourages repetition, values &gt; 1.f discourages it. None means using C++ runtime default 1.f. Defaults to None.</p></li>
<li><p><strong>presence_penalty</strong> (<em>float</em><em>, </em><em>optional</em>) Used to penalize tokens already present in the sequence (irrespective of the number of appearances). It can have any values. Values &lt; 0.f encourage repetition, values &gt; 0.f discourage it. None means using C++ runtime default 0.f. Defaults to None.</p></li>
<li><p><strong>frequency_penalty</strong> (<em>float</em><em>, </em><em>optional</em>) Used to penalize tokens already present in the sequence (dependent on the number of appearances). It can have any values. Values &lt; 0.f encourage repetition, values &gt; 0.f discourage it. None means using C++ runtime default 0.f. Defaults to None.</p></li>
<li><p><strong>length_penalty</strong> (<em>float</em><em>, </em><em>optional</em>) Controls how to penalize longer sequences in beam search. None means using C++ runtime default 0.f. Defaults to None.</p></li>
<li><p><strong>early_stopping</strong> (<em>int</em><em>, </em><em>optional</em>) Controls whether the generation process finishes once beamWidth sentences are generated (ends with end_token). None means using C++ runtime default 1. Defaults to None.</p></li>
<li><p><strong>no_repeat_ngram_size</strong> (<em>int</em><em>, </em><em>optional</em>) Controls how many repeat ngram size are acceptable. None means using C++ runtime default 1 &lt;&lt; 30. Defaults to None.</p></li>
<li><p><strong>min_p</strong> (<em>float</em><em>, </em><em>optional</em>) scale the most likely token to determine the minimum token probability. None means using C++ runtime default 0.0. Defaults to None.</p></li>
<li><p><strong>beam_width_array</strong> (<em>List</em><em>[</em><em>int</em><em>]</em><em>, </em><em>optional</em>) The array of beam width using in Variable-Beam-Width-Search. Defaults to None.</p></li>
<li><p><strong>logprobs</strong> (<em>int</em><em>, </em><em>optional</em>) Number of log probabilities to return per output token. Defaults to None.</p></li>
<li><p><strong>prompt_logprobs</strong> (<em>int</em><em>, </em><em>optional</em>) Number of log probabilities to return per prompt token. Defaults to None.</p></li>
<li><p><strong>return_context_logits</strong> (<em>bool</em>) Controls if Result should contain the context logits. Defaults to False.</p></li>
<li><p><strong>return_generation_logits</strong> (<em>bool</em>) Controls if Result should contain the generation logits. Defaults to False.</p></li>
<li><p><strong>exclude_input_from_output</strong> (<em>bool</em>) Controls if output tokens in Result should include the input tokens. Defaults to True.</p></li>
<li><p><strong>return_encoder_output</strong> (<em>bool</em>) Controls if Result should contain encoder output hidden states (for encoder-only and encoder-decoder models). Defaults to False.</p></li>
<li><p><strong>return_perf_metrics</strong> (<em>bool</em>) Controls if Result should contain the performance metrics for this request. Defaults to False.</p></li>
<li><p><strong>additional_model_outputs</strong> (<em>List</em><em>[</em><em>tensorrt_llm.sampling_params.AdditionalModelOutput</em><em>]</em><em>, </em><em>optional</em>) The additional outputs to gather from the model. Defaults to None.</p></li>
<li><p><strong>lookahead_config</strong> (<em>tensorrt_llm.bindings.executor.LookaheadDecodingConfig</em><em> , </em><em>optional</em>) Lookahead decoding config. Defaults to None.</p></li>
<li><p><strong>guided_decoding</strong> (<a class="reference internal" href="#tensorrt_llm.llmapi.GuidedDecodingParams" title="tensorrt_llm.sampling_params.GuidedDecodingParams"><em>tensorrt_llm.sampling_params.GuidedDecodingParams</em></a><em>, </em><em>optional</em>) Guided decoding params. Defaults to None.</p></li>
<li><p><strong>ignore_eos</strong> (<em>bool</em>) Whether to ignore the EOS token and continue generating tokens after the EOS token is generated. Defaults to False.</p></li>
<li><p><strong>detokenize</strong> (<em>bool</em>) Whether to detokenize the output. Defaults to True.</p></li>
<li><p><strong>add_special_tokens</strong> (<em>bool</em>) Whether to add special tokens to the prompt. Defaults to True.</p></li>
<li><p><strong>truncate_prompt_tokens</strong> (<em>int</em><em>, </em><em>optional</em>) If set to an integer k, will use only the last k tokens from the prompt (i.e., left truncation). Defaults to None.</p></li>
<li><p><strong>skip_special_tokens</strong> (<em>bool</em>) Whether to skip special tokens in the output. Defaults to True.</p></li>
<li><p><strong>spaces_between_special_tokens</strong> (<em>bool</em>) Whether to add spaces between special tokens in the output. Defaults to True.</p></li>
</ul>
</dd>
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">end_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">pad_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">32</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">bad</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">bad_token_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stop</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">stop_token_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">include_stop_str_in_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">embedding_bias</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">logits_processor</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">LogitsProcessor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">LogitsProcessor</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">apply_batched_logits_processor</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">n</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">best_of</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_beam_search</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">top_k</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">top_p</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">top_p_min</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">top_p_reset_ids</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">top_p_decay</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">seed</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">temperature</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">min_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">beam_search_diversity_rate</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">repetition_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">presence_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">frequency_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">length_penalty</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">early_stopping</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">no_repeat_ngram_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">min_p</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">beam_width_array</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">logprobs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">prompt_logprobs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">return_context_logits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">return_generation_logits</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">exclude_input_from_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">return_encoder_output</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">return_perf_metrics</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">additional_model_outputs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">AdditionalModelOutput</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">_context_logits_auto_enabled</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">_generation_logits_auto_enabled</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">_return_log_probs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lookahead_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">LookaheadDecodingConfig</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">guided_decoding</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.GuidedDecodingParams" title="tensorrt_llm.sampling_params.GuidedDecodingParams"><span class="pre">GuidedDecodingParams</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">ignore_eos</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">detokenize</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">add_special_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">truncate_prompt_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">skip_special_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">spaces_between_special_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.__init__" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.add_special_tokens">
<span class="sig-name descname"><span class="pre">add_special_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.add_special_tokens" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.additional_model_outputs">
<span class="sig-name descname"><span class="pre">additional_model_outputs</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">AdditionalModelOutput</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.additional_model_outputs" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.apply_batched_logits_processor">
<span class="sig-name descname"><span class="pre">apply_batched_logits_processor</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.apply_batched_logits_processor" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.bad">
<span class="sig-name descname"><span class="pre">bad</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.bad" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.bad_token_ids">
<span class="sig-name descname"><span class="pre">bad_token_ids</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.bad_token_ids" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.beam_search_diversity_rate">
<span class="sig-name descname"><span class="pre">beam_search_diversity_rate</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.beam_search_diversity_rate" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.beam_width_array">
<span class="sig-name descname"><span class="pre">beam_width_array</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.beam_width_array" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.best_of">
<span class="sig-name descname"><span class="pre">best_of</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.best_of" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.detokenize">
<span class="sig-name descname"><span class="pre">detokenize</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.detokenize" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.early_stopping">
<span class="sig-name descname"><span class="pre">early_stopping</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.early_stopping" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.embedding_bias">
<span class="sig-name descname"><span class="pre">embedding_bias</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Tensor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.embedding_bias" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.end_id">
<span class="sig-name descname"><span class="pre">end_id</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.end_id" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.exclude_input_from_output">
<span class="sig-name descname"><span class="pre">exclude_input_from_output</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.exclude_input_from_output" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.frequency_penalty">
<span class="sig-name descname"><span class="pre">frequency_penalty</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.frequency_penalty" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.guided_decoding">
<span class="sig-name descname"><span class="pre">guided_decoding</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="#tensorrt_llm.llmapi.GuidedDecodingParams" title="tensorrt_llm.sampling_params.GuidedDecodingParams"><span class="pre">GuidedDecodingParams</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.guided_decoding" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.ignore_eos">
<span class="sig-name descname"><span class="pre">ignore_eos</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.ignore_eos" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.include_stop_str_in_output">
<span class="sig-name descname"><span class="pre">include_stop_str_in_output</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.include_stop_str_in_output" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.length_penalty">
<span class="sig-name descname"><span class="pre">length_penalty</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.length_penalty" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.logits_processor">
<span class="sig-name descname"><span class="pre">logits_processor</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">LogitsProcessor</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">LogitsProcessor</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.logits_processor" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.logprobs">
<span class="sig-name descname"><span class="pre">logprobs</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.logprobs" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.lookahead_config">
<span class="sig-name descname"><span class="pre">lookahead_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">LookaheadDecodingConfig</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.lookahead_config" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.max_tokens">
<span class="sig-name descname"><span class="pre">max_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.max_tokens" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.min_p">
<span class="sig-name descname"><span class="pre">min_p</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.min_p" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.min_tokens">
<span class="sig-name descname"><span class="pre">min_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.min_tokens" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.n">
<span class="sig-name descname"><span class="pre">n</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.n" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.no_repeat_ngram_size">
<span class="sig-name descname"><span class="pre">no_repeat_ngram_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.no_repeat_ngram_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.pad_id">
<span class="sig-name descname"><span class="pre">pad_id</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.pad_id" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.presence_penalty">
<span class="sig-name descname"><span class="pre">presence_penalty</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.presence_penalty" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.prompt_logprobs">
<span class="sig-name descname"><span class="pre">prompt_logprobs</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.prompt_logprobs" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.repetition_penalty">
<span class="sig-name descname"><span class="pre">repetition_penalty</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.repetition_penalty" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.return_context_logits">
<span class="sig-name descname"><span class="pre">return_context_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.return_context_logits" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.return_encoder_output">
<span class="sig-name descname"><span class="pre">return_encoder_output</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.return_encoder_output" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.return_generation_logits">
<span class="sig-name descname"><span class="pre">return_generation_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.return_generation_logits" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.return_perf_metrics">
<span class="sig-name descname"><span class="pre">return_perf_metrics</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.return_perf_metrics" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.seed">
<span class="sig-name descname"><span class="pre">seed</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.seed" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.skip_special_tokens">
<span class="sig-name descname"><span class="pre">skip_special_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.skip_special_tokens" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.spaces_between_special_tokens">
<span class="sig-name descname"><span class="pre">spaces_between_special_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.spaces_between_special_tokens" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.stop">
<span class="sig-name descname"><span class="pre">stop</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.stop" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.stop_token_ids">
<span class="sig-name descname"><span class="pre">stop_token_ids</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.stop_token_ids" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.temperature">
<span class="sig-name descname"><span class="pre">temperature</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.temperature" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.top_k">
<span class="sig-name descname"><span class="pre">top_k</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.top_k" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.top_p">
<span class="sig-name descname"><span class="pre">top_p</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.top_p" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.top_p_decay">
<span class="sig-name descname"><span class="pre">top_p_decay</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.top_p_decay" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.top_p_min">
<span class="sig-name descname"><span class="pre">top_p_min</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.top_p_min" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.top_p_reset_ids">
<span class="sig-name descname"><span class="pre">top_p_reset_ids</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.top_p_reset_ids" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.truncate_prompt_tokens">
<span class="sig-name descname"><span class="pre">truncate_prompt_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.truncate_prompt_tokens" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SamplingParams.use_beam_search">
<span class="sig-name descname"><span class="pre">use_beam_search</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SamplingParams.use_beam_search" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.DisaggregatedParams">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">DisaggregatedParams</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">request_type</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">first_gen_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">ctx_request_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">opaque_state</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bytes</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">draft_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/disaggregated_params.html#DisaggregatedParams"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.DisaggregatedParams" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>Disaggregated seving parameters</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>request_type</strong> (<em>str</em>) The type of request (“context_only” or “generation_only”)</p></li>
<li><p><strong>first_gen_tokens</strong> (<em>List</em><em>[</em><em>int</em><em>]</em>) The first tokens of the generation request</p></li>
<li><p><strong>ctx_request_id</strong> (<em>int</em>) The context request id</p></li>
<li><p><strong>opaque_state</strong> (<em>bytes</em>) Any additional state needing to be exchanged between context and gen instances</p></li>
</ul>
</dd>
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.DisaggregatedParams.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">request_type</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">first_gen_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">ctx_request_id</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">opaque_state</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bytes</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">draft_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.DisaggregatedParams.__init__" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.DisaggregatedParams.ctx_request_id">
<span class="sig-name descname"><span class="pre">ctx_request_id</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.DisaggregatedParams.ctx_request_id" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.DisaggregatedParams.draft_tokens">
<span class="sig-name descname"><span class="pre">draft_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.DisaggregatedParams.draft_tokens" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.DisaggregatedParams.first_gen_tokens">
<span class="sig-name descname"><span class="pre">first_gen_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.DisaggregatedParams.first_gen_tokens" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.DisaggregatedParams.get_context_phase_params">
<span class="sig-name descname"><span class="pre">get_context_phase_params</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">ContextPhaseParams</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/disaggregated_params.html#DisaggregatedParams.get_context_phase_params"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.DisaggregatedParams.get_context_phase_params" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.DisaggregatedParams.get_request_type">
<span class="sig-name descname"><span class="pre">get_request_type</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">RequestType</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/disaggregated_params.html#DisaggregatedParams.get_request_type"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.DisaggregatedParams.get_request_type" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.DisaggregatedParams.opaque_state">
<span class="sig-name descname"><span class="pre">opaque_state</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bytes</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.DisaggregatedParams.opaque_state" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.DisaggregatedParams.request_type">
<span class="sig-name descname"><span class="pre">request_type</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.DisaggregatedParams.request_type" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">KvCacheConfig</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">enable_block_reuse</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_attention_window</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">sink_token_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">free_gpu_memory_fraction</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">host_cache_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">onboard_blocks</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cross_kv_cache_fraction</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">secondary_offload_min_priority</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">event_buffer_max_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">enable_partial_reuse</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">copy_on_partial_reuse</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#KvCacheConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">BaseModel</span></code>, <code class="xref py py-class docutils literal notranslate"><span class="pre">PybindMirror</span></code></p>
<p>Configuration for the KV cache.</p>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.copy_on_partial_reuse">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">copy_on_partial_reuse</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.copy_on_partial_reuse" title="Link to this definition">#</a></dt>
<dd><p>Whether partially matched blocks that are in use can be reused after copying them.</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.cross_kv_cache_fraction">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">cross_kv_cache_fraction</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.cross_kv_cache_fraction" title="Link to this definition">#</a></dt>
<dd><p>The fraction of the KV Cache memory should be reserved for cross attention. If set to p, self attention will use 1-p of KV Cache memory and cross attention will use p of KV Cache memory. Default is 50%. Should only be set when using encoder-decoder model.</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.enable_block_reuse">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">enable_block_reuse</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.enable_block_reuse" title="Link to this definition">#</a></dt>
<dd><p>Controls if KV cache blocks can be reused for different requests.</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.enable_partial_reuse">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">enable_partial_reuse</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.enable_partial_reuse" title="Link to this definition">#</a></dt>
<dd><p>Whether blocks that are only partially matched can be reused.</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.event_buffer_max_size">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">event_buffer_max_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.event_buffer_max_size" title="Link to this definition">#</a></dt>
<dd><p>Maximum size of the event buffer. If set to 0, the event buffer will not be used.</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.free_gpu_memory_fraction">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">free_gpu_memory_fraction</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.free_gpu_memory_fraction" title="Link to this definition">#</a></dt>
<dd><p>The fraction of GPU memory fraction that should be allocated for the KV cache. Default is 90%. If both <cite>max_tokens</cite> and <cite>free_gpu_memory_fraction</cite> are specified, memory corresponding to the minimum will be used.</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.host_cache_size">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">host_cache_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.host_cache_size" title="Link to this definition">#</a></dt>
<dd><p>Size of the host cache in bytes. If both <cite>max_tokens</cite> and <cite>host_cache_size</cite> are specified, memory corresponding to the minimum will be used.</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.max_attention_window">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_attention_window</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.max_attention_window" title="Link to this definition">#</a></dt>
<dd><p>Size of the attention window for each sequence. Only the last tokens will be stored in the KV cache. If the number of elements in <cite>max_attention_window</cite> is less than the number of layers, <cite>max_attention_window</cite> will be repeated multiple times to the number of layers.</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.max_tokens">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.max_tokens" title="Link to this definition">#</a></dt>
<dd><p>The maximum number of tokens that should be stored in the KV cache. If both <cite>max_tokens</cite> and <cite>free_gpu_memory_fraction</cite> are specified, memory corresponding to the minimum will be used.</p>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.model_config">
<span class="sig-name descname"><span class="pre">model_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ClassVar</span><span class="p"><span class="pre">[</span></span><span class="pre">ConfigDict</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">{}</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.model_config" title="Link to this definition">#</a></dt>
<dd><p>Configuration for the model, should be a dictionary conforming to [<cite>ConfigDict</cite>][pydantic.config.ConfigDict].</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.onboard_blocks">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">onboard_blocks</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.onboard_blocks" title="Link to this definition">#</a></dt>
<dd><p>Controls if blocks are onboarded.</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.secondary_offload_min_priority">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">secondary_offload_min_priority</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.secondary_offload_min_priority" title="Link to this definition">#</a></dt>
<dd><p>Only blocks with priority &gt; mSecondaryOfflineMinPriority can be offloaded to secondary memory.</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheConfig.sink_token_length">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">sink_token_length</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheConfig.sink_token_length" title="Link to this definition">#</a></dt>
<dd><p>Number of sink tokens (tokens to always keep in attention window).</p>
</dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheRetentionConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">KvCacheRetentionConfig</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">pybind11_object</span></code></p>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">TokenRangeRetentionConfig</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">pybind11_object</span></code></p>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig" title="tensorrt_llm.bindings.executor.KvCacheRetentionConfig.TokenRangeRetentionConfig"><span class="pre">tensorrt_llm.bindings.executor.KvCacheRetentionConfig.TokenRangeRetentionConfig</span></a></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">token_start</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">token_end</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">priority</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">duration_ms</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">datetime.timedelta</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.__init__" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.duration_ms">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">duration_ms</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.duration_ms" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.priority">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">priority</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.priority" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.token_end">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">token_end</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.token_end" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.token_start">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">token_start</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.token_start" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheRetentionConfig.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">self</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig" title="tensorrt_llm.bindings.executor.KvCacheRetentionConfig"><span class="pre">tensorrt_llm.bindings.executor.KvCacheRetentionConfig</span></a></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">token_range_retention_configs</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">list</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig" title="tensorrt_llm.bindings.executor.KvCacheRetentionConfig.TokenRangeRetentionConfig"><span class="pre">tensorrt_llm.bindings.executor.KvCacheRetentionConfig.TokenRangeRetentionConfig</span></a><span class="p"><span class="pre">]</span></span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">decode_retention_priority</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">35</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">decode_duration_ms</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">datetime.timedelta</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.__init__" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheRetentionConfig.decode_duration_ms">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">decode_duration_ms</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.decode_duration_ms" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheRetentionConfig.decode_retention_priority">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">decode_retention_priority</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.decode_retention_priority" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.KvCacheRetentionConfig.token_range_retention_configs">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">token_range_retention_configs</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.token_range_retention_configs" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">LookaheadDecodingConfig</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_draft_len</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">speculative_model</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Path</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_window_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">4</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_ngram_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">3</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_verification_set_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">4</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#LookaheadDecodingConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">DecodingBaseConfig</span></code>, <code class="xref py py-class docutils literal notranslate"><span class="pre">PybindMirror</span></code></p>
<p>Configuration for lookahead speculative decoding.</p>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">data</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#LookaheadDecodingConfig.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.__init__" title="Link to this definition">#</a></dt>
<dd><p>Create a new model by parsing and validating input data from keyword arguments.</p>
<p>Raises [<cite>ValidationError</cite>][pydantic_core.ValidationError] if the input data cannot be
validated to form a valid model.</p>
<p><cite>self</cite> is explicitly positional-only to allow <cite>self</cite> as a field name.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig.calculate_speculative_resource">
<span class="sig-name descname"><span class="pre">calculate_speculative_resource</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#LookaheadDecodingConfig.calculate_speculative_resource"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.calculate_speculative_resource" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig.decoding_type">
<span class="sig-name descname"><span class="pre">decoding_type</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ClassVar</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'Lookahead'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.decoding_type" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig.from_dict">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_dict</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#LookaheadDecodingConfig.from_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.from_dict" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig.max_ngram_size">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_ngram_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">3</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.max_ngram_size" title="Link to this definition">#</a></dt>
<dd><p>Number of tokens per NGram.</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig.max_verification_set_size">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_verification_set_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">4</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.max_verification_set_size" title="Link to this definition">#</a></dt>
<dd><p>Number of NGrams in verification branch per step.</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig.max_window_size">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_window_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">4</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.max_window_size" title="Link to this definition">#</a></dt>
<dd><p>Number of NGrams in lookahead branch per step.</p>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig.model_config">
<span class="sig-name descname"><span class="pre">model_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ClassVar</span><span class="p"><span class="pre">[</span></span><span class="pre">ConfigDict</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">{}</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.model_config" title="Link to this definition">#</a></dt>
<dd><p>Configuration for the model, should be a dictionary conforming to [<cite>ConfigDict</cite>][pydantic.config.ConfigDict].</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.LookaheadDecodingConfig.validate_positive_values">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">validate_positive_values</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">v</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#LookaheadDecodingConfig.validate_positive_values"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.validate_positive_values" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MedusaDecodingConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">MedusaDecodingConfig</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_draft_len</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">speculative_model</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Path</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">medusa_choices</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">num_medusa_heads</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#MedusaDecodingConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.MedusaDecodingConfig" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">DecodingBaseConfig</span></code></p>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MedusaDecodingConfig.decoding_type">
<span class="sig-name descname"><span class="pre">decoding_type</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ClassVar</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'Medusa'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.MedusaDecodingConfig.decoding_type" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MedusaDecodingConfig.from_dict">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_dict</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#MedusaDecodingConfig.from_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.MedusaDecodingConfig.from_dict" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MedusaDecodingConfig.medusa_choices">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">medusa_choices</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.MedusaDecodingConfig.medusa_choices" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MedusaDecodingConfig.model_config">
<span class="sig-name descname"><span class="pre">model_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ClassVar</span><span class="p"><span class="pre">[</span></span><span class="pre">ConfigDict</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">{}</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.MedusaDecodingConfig.model_config" title="Link to this definition">#</a></dt>
<dd><p>Configuration for the model, should be a dictionary conforming to [<cite>ConfigDict</cite>][pydantic.config.ConfigDict].</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MedusaDecodingConfig.num_medusa_heads">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">num_medusa_heads</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.MedusaDecodingConfig.num_medusa_heads" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.EagleDecodingConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">EagleDecodingConfig</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_draft_len</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">speculative_model</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Path</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">eagle_choices</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">greedy_sampling</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">posterior_threshold</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_dynamic_tree</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">dynamic_tree_max_topK</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">num_eagle_layers</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_non_leaves_per_layer</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">pytorch_eagle_weights_path</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#EagleDecodingConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.EagleDecodingConfig" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">DecodingBaseConfig</span></code></p>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.EagleDecodingConfig.decoding_type">
<span class="sig-name descname"><span class="pre">decoding_type</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ClassVar</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'Eagle'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.EagleDecodingConfig.decoding_type" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.EagleDecodingConfig.dynamic_tree_max_topK">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">dynamic_tree_max_topK</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.EagleDecodingConfig.dynamic_tree_max_topK" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.EagleDecodingConfig.eagle_choices">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">eagle_choices</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">int</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.EagleDecodingConfig.eagle_choices" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.EagleDecodingConfig.from_dict">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_dict</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#EagleDecodingConfig.from_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.EagleDecodingConfig.from_dict" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.EagleDecodingConfig.greedy_sampling">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">greedy_sampling</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.EagleDecodingConfig.greedy_sampling" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.EagleDecodingConfig.max_non_leaves_per_layer">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_non_leaves_per_layer</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.EagleDecodingConfig.max_non_leaves_per_layer" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.EagleDecodingConfig.model_config">
<span class="sig-name descname"><span class="pre">model_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ClassVar</span><span class="p"><span class="pre">[</span></span><span class="pre">ConfigDict</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">{}</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.EagleDecodingConfig.model_config" title="Link to this definition">#</a></dt>
<dd><p>Configuration for the model, should be a dictionary conforming to [<cite>ConfigDict</cite>][pydantic.config.ConfigDict].</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.EagleDecodingConfig.num_eagle_layers">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">num_eagle_layers</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.EagleDecodingConfig.num_eagle_layers" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.EagleDecodingConfig.posterior_threshold">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">posterior_threshold</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.EagleDecodingConfig.posterior_threshold" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.EagleDecodingConfig.pytorch_eagle_weights_path">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">pytorch_eagle_weights_path</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.EagleDecodingConfig.pytorch_eagle_weights_path" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.EagleDecodingConfig.use_dynamic_tree">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">use_dynamic_tree</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.EagleDecodingConfig.use_dynamic_tree" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MTPDecodingConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">MTPDecodingConfig</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_draft_len</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">speculative_model</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">Path</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">num_nextn_predict_layers</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_relaxed_acceptance_for_thinking</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">relaxed_topk</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">relaxed_delta</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0.0</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#MTPDecodingConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.MTPDecodingConfig" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">DecodingBaseConfig</span></code></p>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MTPDecodingConfig.decoding_type">
<span class="sig-name descname"><span class="pre">decoding_type</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ClassVar</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'MTP'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.MTPDecodingConfig.decoding_type" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MTPDecodingConfig.from_dict">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_dict</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#MTPDecodingConfig.from_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.MTPDecodingConfig.from_dict" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MTPDecodingConfig.model_config">
<span class="sig-name descname"><span class="pre">model_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ClassVar</span><span class="p"><span class="pre">[</span></span><span class="pre">ConfigDict</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">{}</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.MTPDecodingConfig.model_config" title="Link to this definition">#</a></dt>
<dd><p>Configuration for the model, should be a dictionary conforming to [<cite>ConfigDict</cite>][pydantic.config.ConfigDict].</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MTPDecodingConfig.num_nextn_predict_layers">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">num_nextn_predict_layers</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.MTPDecodingConfig.num_nextn_predict_layers" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MTPDecodingConfig.relaxed_delta">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">relaxed_delta</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0.0</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.MTPDecodingConfig.relaxed_delta" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MTPDecodingConfig.relaxed_topk">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">relaxed_topk</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.MTPDecodingConfig.relaxed_topk" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MTPDecodingConfig.use_relaxed_acceptance_for_thinking">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">use_relaxed_acceptance_for_thinking</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.MTPDecodingConfig.use_relaxed_acceptance_for_thinking" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SchedulerConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">SchedulerConfig</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">capacity_scheduler_policy</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy" title="tensorrt_llm.llmapi.llm_args.CapacitySchedulerPolicy"><span class="pre">CapacitySchedulerPolicy</span></a></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">CapacitySchedulerPolicy.GUARANTEED_NO_EVICT</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">context_chunking_policy</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.ContextChunkingPolicy" title="tensorrt_llm.llmapi.llm_args.ContextChunkingPolicy"><span class="pre">ContextChunkingPolicy</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">dynamic_batch_config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#tensorrt_llm.llmapi.DynamicBatchConfig" title="tensorrt_llm.llmapi.llm_args.DynamicBatchConfig"><span class="pre">DynamicBatchConfig</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#SchedulerConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.SchedulerConfig" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">BaseModel</span></code>, <code class="xref py py-class docutils literal notranslate"><span class="pre">PybindMirror</span></code></p>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SchedulerConfig.capacity_scheduler_policy">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">capacity_scheduler_policy</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy" title="tensorrt_llm.llmapi.llm_args.CapacitySchedulerPolicy"><span class="pre">CapacitySchedulerPolicy</span></a></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">CapacitySchedulerPolicy.GUARANTEED_NO_EVICT</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SchedulerConfig.capacity_scheduler_policy" title="Link to this definition">#</a></dt>
<dd><p>The capacity scheduler policy to use</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SchedulerConfig.context_chunking_policy">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">context_chunking_policy</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="#tensorrt_llm.llmapi.ContextChunkingPolicy" title="tensorrt_llm.llmapi.llm_args.ContextChunkingPolicy"><span class="pre">ContextChunkingPolicy</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SchedulerConfig.context_chunking_policy" title="Link to this definition">#</a></dt>
<dd><p>The context chunking policy to use</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SchedulerConfig.dynamic_batch_config">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">dynamic_batch_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="#tensorrt_llm.llmapi.DynamicBatchConfig" title="tensorrt_llm.llmapi.llm_args.DynamicBatchConfig"><span class="pre">DynamicBatchConfig</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SchedulerConfig.dynamic_batch_config" title="Link to this definition">#</a></dt>
<dd><p>The dynamic batch config to use</p>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.SchedulerConfig.model_config">
<span class="sig-name descname"><span class="pre">model_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ClassVar</span><span class="p"><span class="pre">[</span></span><span class="pre">ConfigDict</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">{}</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.SchedulerConfig.model_config" title="Link to this definition">#</a></dt>
<dd><p>Configuration for the model, should be a dictionary conforming to [<cite>ConfigDict</cite>][pydantic.config.ConfigDict].</p>
</dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CapacitySchedulerPolicy">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">CapacitySchedulerPolicy</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">value</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">names=&lt;not</span> <span class="pre">given&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">*values</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">module=None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">qualname=None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">type=None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">start=1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">boundary=None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#CapacitySchedulerPolicy"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">StrEnum</span></code></p>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT">
<span class="sig-name descname"><span class="pre">GUARANTEED_NO_EVICT</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'GUARANTEED_NO_EVICT'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CapacitySchedulerPolicy.MAX_UTILIZATION">
<span class="sig-name descname"><span class="pre">MAX_UTILIZATION</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'MAX_UTILIZATION'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy.MAX_UTILIZATION" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CapacitySchedulerPolicy.STATIC_BATCH">
<span class="sig-name descname"><span class="pre">STATIC_BATCH</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'STATIC_BATCH'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy.STATIC_BATCH" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">BuildConfig</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">max_input_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1024</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_seq_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">opt_batch_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">8</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_batch_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">2048</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_beam_width:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_num_tokens:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">8192</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">opt_num_tokens:</span> <span class="pre">Optional[int]</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_prompt_embedding_table_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">kv_cache_type:</span> <span class="pre">tensorrt_llm.bindings.KVCacheType</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">gather_context_logits:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">gather_generation_logits:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">strongly_typed:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">force_num_profiles:</span> <span class="pre">Optional[int]</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">profiling_verbosity:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">'layer_names_only'</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">enable_debug_output:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_draft_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">speculative_decoding_mode:</span> <span class="pre">tensorrt_llm.models.modeling_utils.SpeculativeDecodingMode</span> <span class="pre">=</span> <span class="pre">&lt;SpeculativeDecodingMode.NONE:</span> <span class="pre">1&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_refit:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">input_timing_cache:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">output_timing_cache:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">'model.cache'</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_config:</span> <span class="pre">tensorrt_llm.lora_manager.LoraConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">auto_parallel_config:</span> <span class="pre">tensorrt_llm.auto_parallel.config.AutoParallelConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">weight_sparsity:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">weight_streaming:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">plugin_config:</span> <span class="pre">tensorrt_llm.plugin.plugin.PluginConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_strip_plan:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_encoder_input_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1024</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">dry_run:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">visualize_network:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">monitor_memory:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_mrope:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">max_input_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1024</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_seq_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">opt_batch_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">8</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_batch_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">2048</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_beam_width:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_num_tokens:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">8192</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">opt_num_tokens:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_prompt_embedding_table_size:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">kv_cache_type:</span> <span class="pre">~tensorrt_llm.bindings.KVCacheType</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">gather_context_logits:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">gather_generation_logits:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">strongly_typed:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">force_num_profiles:</span> <span class="pre">int</span> <span class="pre">|</span> <span class="pre">None</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">profiling_verbosity:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">'layer_names_only'</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">enable_debug_output:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_draft_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">0</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">speculative_decoding_mode:</span> <span class="pre">~tensorrt_llm.models.modeling_utils.SpeculativeDecodingMode</span> <span class="pre">=</span> <span class="pre">&lt;SpeculativeDecodingMode.NONE:</span> <span class="pre">1&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_refit:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">input_timing_cache:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">output_timing_cache:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">'model.cache'</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">lora_config:</span> <span class="pre">~tensorrt_llm.lora_manager.LoraConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">auto_parallel_config:</span> <span class="pre">~tensorrt_llm.auto_parallel.config.AutoParallelConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">weight_sparsity:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">weight_streaming:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">plugin_config:</span> <span class="pre">~tensorrt_llm.plugin.plugin.PluginConfig</span> <span class="pre">=</span> <span class="pre">&lt;factory&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_strip_plan:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_encoder_input_len:</span> <span class="pre">int</span> <span class="pre">=</span> <span class="pre">1024</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">dry_run:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">visualize_network:</span> <span class="pre">str</span> <span class="pre">=</span> <span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">monitor_memory:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_mrope:</span> <span class="pre">bool</span> <span class="pre">=</span> <span class="pre">False</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.__init__" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.auto_parallel_config">
<span class="sig-name descname"><span class="pre">auto_parallel_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">AutoParallelConfig</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.auto_parallel_config" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.dry_run">
<span class="sig-name descname"><span class="pre">dry_run</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.dry_run" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.enable_debug_output">
<span class="sig-name descname"><span class="pre">enable_debug_output</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.enable_debug_output" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.force_num_profiles">
<span class="sig-name descname"><span class="pre">force_num_profiles</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.force_num_profiles" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.from_dict">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_dict</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">config</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">plugin_config</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig.from_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.from_dict" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.from_json_file">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_json_file</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">config_file</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">plugin_config</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig.from_json_file"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.from_json_file" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.gather_context_logits">
<span class="sig-name descname"><span class="pre">gather_context_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.gather_context_logits" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.gather_generation_logits">
<span class="sig-name descname"><span class="pre">gather_generation_logits</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.gather_generation_logits" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.input_timing_cache">
<span class="sig-name descname"><span class="pre">input_timing_cache</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.input_timing_cache" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.kv_cache_type">
<span class="sig-name descname"><span class="pre">kv_cache_type</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">KVCacheType</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.kv_cache_type" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.lora_config">
<span class="sig-name descname"><span class="pre">lora_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">LoraConfig</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.lora_config" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.max_batch_size">
<span class="sig-name descname"><span class="pre">max_batch_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">2048</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.max_batch_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.max_beam_width">
<span class="sig-name descname"><span class="pre">max_beam_width</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.max_beam_width" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.max_draft_len">
<span class="sig-name descname"><span class="pre">max_draft_len</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.max_draft_len" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.max_encoder_input_len">
<span class="sig-name descname"><span class="pre">max_encoder_input_len</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1024</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.max_encoder_input_len" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.max_input_len">
<span class="sig-name descname"><span class="pre">max_input_len</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1024</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.max_input_len" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.max_num_tokens">
<span class="sig-name descname"><span class="pre">max_num_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">8192</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.max_num_tokens" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.max_prompt_embedding_table_size">
<span class="sig-name descname"><span class="pre">max_prompt_embedding_table_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.max_prompt_embedding_table_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.max_seq_len">
<span class="sig-name descname"><span class="pre">max_seq_len</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.max_seq_len" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.monitor_memory">
<span class="sig-name descname"><span class="pre">monitor_memory</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.monitor_memory" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.opt_batch_size">
<span class="sig-name descname"><span class="pre">opt_batch_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">8</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.opt_batch_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.opt_num_tokens">
<span class="sig-name descname"><span class="pre">opt_num_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.opt_num_tokens" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.output_timing_cache">
<span class="sig-name descname"><span class="pre">output_timing_cache</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'model.cache'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.output_timing_cache" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.plugin_config">
<span class="sig-name descname"><span class="pre">plugin_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="../python-api/tensorrt_llm.plugin.html#tensorrt_llm.plugin.PluginConfig" title="tensorrt_llm.plugin.plugin.PluginConfig"><span class="pre">PluginConfig</span></a></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.plugin_config" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.profiling_verbosity">
<span class="sig-name descname"><span class="pre">profiling_verbosity</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'layer_names_only'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.profiling_verbosity" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.speculative_decoding_mode">
<span class="sig-name descname"><span class="pre">speculative_decoding_mode</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="../python-api/tensorrt_llm.models.html#tensorrt_llm.models.SpeculativeDecodingMode" title="tensorrt_llm.models.modeling_utils.SpeculativeDecodingMode"><span class="pre">SpeculativeDecodingMode</span></a></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.speculative_decoding_mode" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.strongly_typed">
<span class="sig-name descname"><span class="pre">strongly_typed</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.strongly_typed" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.to_dict">
<span class="sig-name descname"><span class="pre">to_dict</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig.to_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.to_dict" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.update">
<span class="sig-name descname"><span class="pre">update</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig.update"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.update" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.update_from_dict">
<span class="sig-name descname"><span class="pre">update_from_dict</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig.update_from_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.update_from_dict" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.update_kv_cache_type">
<span class="sig-name descname"><span class="pre">update_kv_cache_type</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">model_architecture</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/builder.html#BuildConfig.update_kv_cache_type"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.update_kv_cache_type" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.use_mrope">
<span class="sig-name descname"><span class="pre">use_mrope</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.use_mrope" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.use_refit">
<span class="sig-name descname"><span class="pre">use_refit</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.use_refit" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.use_strip_plan">
<span class="sig-name descname"><span class="pre">use_strip_plan</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.use_strip_plan" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.visualize_network">
<span class="sig-name descname"><span class="pre">visualize_network</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.visualize_network" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.weight_sparsity">
<span class="sig-name descname"><span class="pre">weight_sparsity</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.weight_sparsity" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildConfig.weight_streaming">
<span class="sig-name descname"><span class="pre">weight_streaming</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BuildConfig.weight_streaming" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">QuantConfig</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">quant_algo</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html#tensorrt_llm.quantization.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><span class="pre">QuantAlgo</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">kv_cache_quant_algo</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html#tensorrt_llm.quantization.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><span class="pre">QuantAlgo</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">group_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">128</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">smoothquant_val</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0.5</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">clamp_val</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">float</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_meta_recipe</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">has_zero_point</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">pre_quant_scale</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">exclude_modules</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/models/modeling_utils.html#QuantConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>Serializable quantization configuration class, part of the PretrainedConfig.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>quant_algo</strong> (<a class="reference internal" href="../python-api/tensorrt_llm.quantization.html#tensorrt_llm.quantization.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><em>tensorrt_llm.quantization.mode.QuantAlgo</em></a><em>, </em><em>optional</em>) Quantization algorithm. Defaults to None.</p></li>
<li><p><strong>kv_cache_quant_algo</strong> (<a class="reference internal" href="../python-api/tensorrt_llm.quantization.html#tensorrt_llm.quantization.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><em>tensorrt_llm.quantization.mode.QuantAlgo</em></a><em>, </em><em>optional</em>) KV cache quantization algorithm. Defaults to None.</p></li>
<li><p><strong>group_size</strong> (<em>int</em>) The group size for group-wise quantization. Defaults to 128.</p></li>
<li><p><strong>smoothquant_val</strong> (<em>float</em>) The smoothing parameter alpha used in smooth quant. Defaults to 0.5.</p></li>
<li><p><strong>clamp_val</strong> (<em>List</em><em>[</em><em>float</em><em>]</em><em>, </em><em>optional</em>) The clamp values used in FP8 rowwise quantization. Defaults to None.</p></li>
<li><p><strong>use_meta_recipe</strong> (<em>bool</em>) Whether to use Metas recipe for FP8 rowwise quantization. Defaults to False.</p></li>
<li><p><strong>has_zero_point</strong> (<em>bool</em>) Whether to use zero point for quantization. Defaults to False.</p></li>
<li><p><strong>pre_quant_scale</strong> (<em>bool</em>) Whether to use pre-quant scale for quantization. Defaults to False.</p></li>
<li><p><strong>exclude_modules</strong> (<em>List</em><em>[</em><em>str</em><em>]</em><em>, </em><em>optional</em>) The module name patterns that are skipped in quantization. Defaults to None.</p></li>
</ul>
</dd>
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">quant_algo</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html#tensorrt_llm.quantization.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><span class="pre">QuantAlgo</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">kv_cache_quant_algo</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html#tensorrt_llm.quantization.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><span class="pre">QuantAlgo</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">group_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">128</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">smoothquant_val</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0.5</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">clamp_val</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">float</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">use_meta_recipe</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">has_zero_point</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">pre_quant_scale</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">exclude_modules</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">None</span></span></span><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.__init__" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.clamp_val">
<span class="sig-name descname"><span class="pre">clamp_val</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">float</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.clamp_val" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.exclude_modules">
<span class="sig-name descname"><span class="pre">exclude_modules</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">str</span><span class="p"><span class="pre">]</span></span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.exclude_modules" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.from_dict">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_dict</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig" title="tensorrt_llm.models.modeling_utils.QuantConfig"><span class="pre">QuantConfig</span></a></span></span><a class="reference internal" href="../_modules/tensorrt_llm/models/modeling_utils.html#QuantConfig.from_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.from_dict" title="Link to this definition">#</a></dt>
<dd><p>Create a QuantConfig instance from a dict.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>config</strong> (<em>dict</em>) The dict used to create QuantConfig.</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>The QuantConfig created from dict.</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference internal" href="#tensorrt_llm.llmapi.QuantConfig" title="tensorrt_llm.models.modeling_utils.QuantConfig">tensorrt_llm.models.modeling_utils.QuantConfig</a></p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.group_size">
<span class="sig-name descname"><span class="pre">group_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">128</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.group_size" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.has_zero_point">
<span class="sig-name descname"><span class="pre">has_zero_point</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.has_zero_point" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.is_module_excluded_from_quantization">
<span class="sig-name descname"><span class="pre">is_module_excluded_from_quantization</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">name</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">bool</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/models/modeling_utils.html#QuantConfig.is_module_excluded_from_quantization"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.is_module_excluded_from_quantization" title="Link to this definition">#</a></dt>
<dd><p>Check if the module is excluded from quantization.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>name</strong> (<em>str</em>) The name of the module.</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>True if the module is excluded from quantization, False otherwise.</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p>bool</p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.kv_cache_quant_algo">
<span class="sig-name descname"><span class="pre">kv_cache_quant_algo</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html#tensorrt_llm.quantization.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><span class="pre">QuantAlgo</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.kv_cache_quant_algo" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.layer_quant_mode">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">layer_quant_mode</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html#tensorrt_llm.quantization.QuantMode" title="tensorrt_llm.quantization.mode.QuantMode"><span class="pre">QuantMode</span></a></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.layer_quant_mode" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.pre_quant_scale">
<span class="sig-name descname"><span class="pre">pre_quant_scale</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.pre_quant_scale" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.quant_algo">
<span class="sig-name descname"><span class="pre">quant_algo</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html#tensorrt_llm.quantization.QuantAlgo" title="tensorrt_llm.quantization.mode.QuantAlgo"><span class="pre">QuantAlgo</span></a><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.quant_algo" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.quant_mode">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">quant_mode</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">QuantModeWrapper</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.quant_mode" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.smoothquant_val">
<span class="sig-name descname"><span class="pre">smoothquant_val</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0.5</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.smoothquant_val" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.to_dict">
<span class="sig-name descname"><span class="pre">to_dict</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">dict</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/models/modeling_utils.html#QuantConfig.to_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.to_dict" title="Link to this definition">#</a></dt>
<dd><p>Dump a QuantConfig instance to a dict.</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>The dict dumped from QuantConfig.</p>
</dd>
<dt class="field-even">Return type<span class="colon">:</span></dt>
<dd class="field-even"><p>dict</p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantConfig.use_meta_recipe">
<span class="sig-name descname"><span class="pre">use_meta_recipe</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantConfig.use_meta_recipe" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">QuantAlgo</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">value</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">names=&lt;not</span> <span class="pre">given&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">*values</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">module=None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">qualname=None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">type=None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">start=1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">boundary=None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/quantization/mode.html#QuantAlgo"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">StrEnum</span></code></p>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.FP8">
<span class="sig-name descname"><span class="pre">FP8</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'FP8'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.FP8" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.FP8_BLOCK_SCALES">
<span class="sig-name descname"><span class="pre">FP8_BLOCK_SCALES</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'FP8_BLOCK_SCALES'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.FP8_BLOCK_SCALES" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN">
<span class="sig-name descname"><span class="pre">FP8_PER_CHANNEL_PER_TOKEN</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'FP8_PER_CHANNEL_PER_TOKEN'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.INT8">
<span class="sig-name descname"><span class="pre">INT8</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'INT8'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.INT8" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.MIXED_PRECISION">
<span class="sig-name descname"><span class="pre">MIXED_PRECISION</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'MIXED_PRECISION'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.MIXED_PRECISION" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.NO_QUANT">
<span class="sig-name descname"><span class="pre">NO_QUANT</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'NO_QUANT'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.NO_QUANT" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.NVFP4">
<span class="sig-name descname"><span class="pre">NVFP4</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'NVFP4'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.NVFP4" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W4A16">
<span class="sig-name descname"><span class="pre">W4A16</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W4A16'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W4A16" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W4A16_AWQ">
<span class="sig-name descname"><span class="pre">W4A16_AWQ</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W4A16_AWQ'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W4A16_AWQ" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W4A16_GPTQ">
<span class="sig-name descname"><span class="pre">W4A16_GPTQ</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W4A16_GPTQ'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W4A16_GPTQ" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W4A8_AWQ">
<span class="sig-name descname"><span class="pre">W4A8_AWQ</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W4A8_AWQ'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W4A8_AWQ" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W4A8_QSERVE_PER_CHANNEL">
<span class="sig-name descname"><span class="pre">W4A8_QSERVE_PER_CHANNEL</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W4A8_QSERVE_PER_CHANNEL'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W4A8_QSERVE_PER_CHANNEL" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W4A8_QSERVE_PER_GROUP">
<span class="sig-name descname"><span class="pre">W4A8_QSERVE_PER_GROUP</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W4A8_QSERVE_PER_GROUP'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W4A8_QSERVE_PER_GROUP" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W8A16">
<span class="sig-name descname"><span class="pre">W8A16</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A16'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W8A16" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W8A16_GPTQ">
<span class="sig-name descname"><span class="pre">W8A16_GPTQ</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A16_GPTQ'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W8A16_GPTQ" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL">
<span class="sig-name descname"><span class="pre">W8A8_SQ_PER_CHANNEL</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A8_SQ_PER_CHANNEL'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN">
<span class="sig-name descname"><span class="pre">W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN">
<span class="sig-name descname"><span class="pre">W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN">
<span class="sig-name descname"><span class="pre">W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN">
<span class="sig-name descname"><span class="pre">W8A8_SQ_PER_TENSOR_PLUGIN</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'W8A8_SQ_PER_TENSOR_PLUGIN'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">CalibConfig</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">device</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Literal</span><span class="p"><span class="pre">[</span></span><span class="s"><span class="pre">'cuda'</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="s"><span class="pre">'cpu'</span></span><span class="p"><span class="pre">]</span></span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'cuda'</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">calib_dataset</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">'cnn_dailymail'</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">calib_batches</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">512</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">calib_batch_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">calib_max_seq_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">512</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">random_seed</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1234</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">tokenizer_max_seq_length</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">2048</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#CalibConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">BaseModel</span></code></p>
<p>Calibration configuration.</p>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.calib_batch_size">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">calib_batch_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.calib_batch_size" title="Link to this definition">#</a></dt>
<dd><p>The batch size that the calibration runs.</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.calib_batches">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">calib_batches</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">512</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.calib_batches" title="Link to this definition">#</a></dt>
<dd><p>The number of batches that the calibration runs.</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.calib_dataset">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">calib_dataset</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">str</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'cnn_dailymail'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.calib_dataset" title="Link to this definition">#</a></dt>
<dd><p>The name or local path of calibration dataset.</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.calib_max_seq_length">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">calib_max_seq_length</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">512</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.calib_max_seq_length" title="Link to this definition">#</a></dt>
<dd><p>The maximum sequence length that the calibration runs.</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.device">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">device</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Literal</span><span class="p"><span class="pre">[</span></span><span class="s"><span class="pre">'cuda'</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="s"><span class="pre">'cpu'</span></span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'cuda'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.device" title="Link to this definition">#</a></dt>
<dd><p>The device to run calibration.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.from_dict">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">from_dict</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">config</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#tensorrt_llm.llmapi.CalibConfig" title="tensorrt_llm.llmapi.llm_args.CalibConfig"><span class="pre">CalibConfig</span></a></span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#CalibConfig.from_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.from_dict" title="Link to this definition">#</a></dt>
<dd><p>Create a CalibConfig instance from a dict.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>config</strong> (<em>dict</em>) The dict used to create CalibConfig.</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>The CalibConfig created from dict.</p>
</dd>
<dt class="field-odd">Return type<span class="colon">:</span></dt>
<dd class="field-odd"><p><a class="reference internal" href="#tensorrt_llm.llmapi.CalibConfig" title="tensorrt_llm.llmapi.CalibConfig">tensorrt_llm.llmapi.CalibConfig</a></p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.model_config">
<span class="sig-name descname"><span class="pre">model_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ClassVar</span><span class="p"><span class="pre">[</span></span><span class="pre">ConfigDict</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">{}</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.model_config" title="Link to this definition">#</a></dt>
<dd><p>Configuration for the model, should be a dictionary conforming to [<cite>ConfigDict</cite>][pydantic.config.ConfigDict].</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.random_seed">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">random_seed</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">1234</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.random_seed" title="Link to this definition">#</a></dt>
<dd><p>The random seed used for calibration.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.to_dict">
<span class="sig-name descname"><span class="pre">to_dict</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">dict</span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#CalibConfig.to_dict"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.to_dict" title="Link to this definition">#</a></dt>
<dd><p>Dump a CalibConfig instance to a dict.</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>The dict dumped from CalibConfig.</p>
</dd>
<dt class="field-even">Return type<span class="colon">:</span></dt>
<dd class="field-even"><p>dict</p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CalibConfig.tokenizer_max_seq_length">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">tokenizer_max_seq_length</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">2048</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CalibConfig.tokenizer_max_seq_length" title="Link to this definition">#</a></dt>
<dd><p>The maximum sequence length to initialize tokenizer for calibration.</p>
</dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildCacheConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">BuildCacheConfig</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">cache_root</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Path</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_records</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">10</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_cache_storage_gb</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">256</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/build_cache.html#BuildCacheConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BuildCacheConfig" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>Configuration for the build cache.</p>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildCacheConfig.cache_root">
<span class="sig-name descname"><span class="pre">cache_root</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.BuildCacheConfig.cache_root" title="Link to this definition">#</a></dt>
<dd><p>The root directory for the build cache.</p>
<dl class="field-list simple">
<dt class="field-odd">Type<span class="colon">:</span></dt>
<dd class="field-odd"><p>str</p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildCacheConfig.max_records">
<span class="sig-name descname"><span class="pre">max_records</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.BuildCacheConfig.max_records" title="Link to this definition">#</a></dt>
<dd><p>The maximum number of records to store in the cache.</p>
<dl class="field-list simple">
<dt class="field-odd">Type<span class="colon">:</span></dt>
<dd class="field-odd"><p>int</p>
</dd>
</dl>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildCacheConfig.max_cache_storage_gb">
<span class="sig-name descname"><span class="pre">max_cache_storage_gb</span></span><a class="headerlink" href="#tensorrt_llm.llmapi.BuildCacheConfig.max_cache_storage_gb" title="Link to this definition">#</a></dt>
<dd><p>The maximum amount of storage (in GB) to use for the cache.</p>
<dl class="field-list simple">
<dt class="field-odd">Type<span class="colon">:</span></dt>
<dd class="field-odd"><p>float</p>
</dd>
</dl>
</dd></dl>
<div class="admonition note">
<p class="admonition-title">Note</p>
<p>The build-cache assumes the weights of the model are not changed during the execution. If the weights are
changed, you should remove the caches manually.</p>
</div>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BuildCacheConfig.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">cache_root</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Path</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_records</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">10</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">max_cache_storage_gb</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">float</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">256</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/build_cache.html#BuildCacheConfig.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BuildCacheConfig.__init__" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="id7">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">cache_root</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">Path</span></em><a class="headerlink" href="#id7" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="id8">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_cache_storage_gb</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">float</span></em><a class="headerlink" href="#id8" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="id9">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_records</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><a class="headerlink" href="#id9" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.RequestError">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">RequestError</span></span><a class="reference internal" href="../_modules/tensorrt_llm/executor/utils.html#RequestError"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.RequestError" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">RuntimeError</span></code></p>
<p>The error raised when the request is failed.</p>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MpiCommSession">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">MpiCommSession</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">comm</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_workers</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/mpi_session.html#MpiCommSession"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.MpiCommSession" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">MpiSession</span></code></p>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MpiCommSession.__init__">
<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">comm</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_workers</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">1</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/mpi_session.html#MpiCommSession.__init__"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.MpiCommSession.__init__" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MpiCommSession.abort">
<span class="sig-name descname"><span class="pre">abort</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/mpi_session.html#MpiCommSession.abort"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.MpiCommSession.abort" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MpiCommSession.get_comm">
<span class="sig-name descname"><span class="pre">get_comm</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/mpi_session.html#MpiCommSession.get_comm"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.MpiCommSession.get_comm" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MpiCommSession.shutdown">
<span class="sig-name descname"><span class="pre">shutdown</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">wait</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/mpi_session.html#MpiCommSession.shutdown"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.MpiCommSession.shutdown" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MpiCommSession.submit">
<span class="sig-name descname"><span class="pre">submit</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">task</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Callable</span><span class="p"><span class="pre">[</span></span><span class="p"><span class="pre">[</span></span><span class="p"><span class="pre">...</span></span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">T</span><span class="p"><span class="pre">]</span></span></span></em>,</dd>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>,</dd>
<dd><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">Future</span><span class="p"><span class="pre">[</span></span><span class="pre">T</span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">]</span></span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/mpi_session.html#MpiCommSession.submit"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.MpiCommSession.submit" title="Link to this definition">#</a></dt>
<dd><p>Submit a task to MPI workers.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>task</strong> The task to be submitted.</p></li>
<li><p><strong>args</strong> Positional arguments for the task.</p></li>
<li><p><strong>kwargs</strong> Keyword arguments for the task.</p></li>
</ul>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.MpiCommSession.submit_sync">
<span class="sig-name descname"><span class="pre">submit_sync</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">task</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Callable</span><span class="p"><span class="pre">[</span></span><span class="p"><span class="pre">[</span></span><span class="p"><span class="pre">...</span></span><span class="p"><span class="pre">]</span></span><span class="p"><span class="pre">,</span></span><span class="w"> </span><span class="pre">T</span><span class="p"><span class="pre">]</span></span></span></em>,</dd>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>,</dd>
<dd><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><span class="pre">List</span><span class="p"><span class="pre">[</span></span><span class="pre">T</span><span class="p"><span class="pre">]</span></span></span></span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/mpi_session.html#MpiCommSession.submit_sync"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.MpiCommSession.submit_sync" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">ExtendedRuntimePerfKnobConfig</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">multi_block_mode</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">True</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">enable_context_fmha_fp32_acc</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cuda_graph_mode</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">False</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">cuda_graph_cache_size</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">0</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#ExtendedRuntimePerfKnobConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">BaseModel</span></code>, <code class="xref py py-class docutils literal notranslate"><span class="pre">PybindMirror</span></code></p>
<p>Configuration for extended runtime performance knobs.</p>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.cuda_graph_cache_size">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">cuda_graph_cache_size</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">0</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.cuda_graph_cache_size" title="Link to this definition">#</a></dt>
<dd><p>Number of cuda graphs to be cached in the runtime. The larger the cache, the better the perf, but more GPU memory is consumed.</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.cuda_graph_mode">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">cuda_graph_mode</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.cuda_graph_mode" title="Link to this definition">#</a></dt>
<dd><p>Whether to use CUDA graph mode.</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.enable_context_fmha_fp32_acc">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">enable_context_fmha_fp32_acc</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">False</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.enable_context_fmha_fp32_acc" title="Link to this definition">#</a></dt>
<dd><p>Whether to enable context FMHA FP32 accumulation.</p>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.model_config">
<span class="sig-name descname"><span class="pre">model_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ClassVar</span><span class="p"><span class="pre">[</span></span><span class="pre">ConfigDict</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">{}</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.model_config" title="Link to this definition">#</a></dt>
<dd><p>Configuration for the model, should be a dictionary conforming to [<cite>ConfigDict</cite>][pydantic.config.ConfigDict].</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.multi_block_mode">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">multi_block_mode</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">True</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.multi_block_mode" title="Link to this definition">#</a></dt>
<dd><p>Whether to use multi-block mode.</p>
</dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BatchingType">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">BatchingType</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">value</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">names=&lt;not</span> <span class="pre">given&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">*values</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">module=None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">qualname=None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">type=None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">start=1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">boundary=None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#BatchingType"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.BatchingType" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">StrEnum</span></code></p>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BatchingType.INFLIGHT">
<span class="sig-name descname"><span class="pre">INFLIGHT</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'INFLIGHT'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BatchingType.INFLIGHT" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.BatchingType.STATIC">
<span class="sig-name descname"><span class="pre">STATIC</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'STATIC'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.BatchingType.STATIC" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.ContextChunkingPolicy">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">ContextChunkingPolicy</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="n"><span class="pre">value</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">names=&lt;not</span> <span class="pre">given&gt;</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">*values</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">module=None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">qualname=None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">type=None</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">start=1</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">boundary=None</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#ContextChunkingPolicy"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.ContextChunkingPolicy" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">StrEnum</span></code></p>
<p>Context chunking policy.</p>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.ContextChunkingPolicy.EQUAL_PROGRESS">
<span class="sig-name descname"><span class="pre">EQUAL_PROGRESS</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'EQUAL_PROGRESS'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.ContextChunkingPolicy.EQUAL_PROGRESS" title="Link to this definition">#</a></dt>
<dd></dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.ContextChunkingPolicy.FIRST_COME_FIRST_SERVED">
<span class="sig-name descname"><span class="pre">FIRST_COME_FIRST_SERVED</span></span><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">'FIRST_COME_FIRST_SERVED'</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.ContextChunkingPolicy.FIRST_COME_FIRST_SERVED" title="Link to this definition">#</a></dt>
<dd></dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.DynamicBatchConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">DynamicBatchConfig</span></span><span class="sig-paren">(</span>
<dl>
<dd><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">enable_batch_size_tuning</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">enable_max_num_tokens_tuning</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">bool</span></span></em>,</dd>
<dd><em class="sig-param"><span class="n"><span class="pre">dynamic_batch_moving_average_window</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span></em>,</dd>
</dl>
<span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#DynamicBatchConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.DynamicBatchConfig" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">BaseModel</span></code>, <code class="xref py py-class docutils literal notranslate"><span class="pre">PybindMirror</span></code></p>
<p>Dynamic batch configuration.</p>
<p>Controls how batch size and token limits are dynamically adjusted at runtime.</p>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.DynamicBatchConfig.dynamic_batch_moving_average_window">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">dynamic_batch_moving_average_window</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span></em><em class="property"> <span class="pre">[Required]</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.DynamicBatchConfig.dynamic_batch_moving_average_window" title="Link to this definition">#</a></dt>
<dd><p>The window size for moving average of input and output length which is used to calculate dynamic batch size and max num tokens</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.DynamicBatchConfig.enable_batch_size_tuning">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">enable_batch_size_tuning</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"> <span class="pre">[Required]</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.DynamicBatchConfig.enable_batch_size_tuning" title="Link to this definition">#</a></dt>
<dd><p>Controls if the batch size should be tuned dynamically</p>
</dd></dl>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.DynamicBatchConfig.enable_max_num_tokens_tuning">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">enable_max_num_tokens_tuning</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">bool</span></em><em class="property"> <span class="pre">[Required]</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.DynamicBatchConfig.enable_max_num_tokens_tuning" title="Link to this definition">#</a></dt>
<dd><p>Controls if the max num tokens should be tuned dynamically</p>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.DynamicBatchConfig.model_config">
<span class="sig-name descname"><span class="pre">model_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ClassVar</span><span class="p"><span class="pre">[</span></span><span class="pre">ConfigDict</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">{}</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.DynamicBatchConfig.model_config" title="Link to this definition">#</a></dt>
<dd><p>Configuration for the model, should be a dictionary conforming to [<cite>ConfigDict</cite>][pydantic.config.ConfigDict].</p>
</dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CacheTransceiverConfig">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">tensorrt_llm.llmapi.</span></span><span class="sig-name descname"><span class="pre">CacheTransceiverConfig</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">max_num_tokens</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="../_modules/tensorrt_llm/llmapi/llm_args.html#CacheTransceiverConfig"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#tensorrt_llm.llmapi.CacheTransceiverConfig" title="Link to this definition">#</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">BaseModel</span></code>, <code class="xref py py-class docutils literal notranslate"><span class="pre">PybindMirror</span></code></p>
<p>Configuration for the cache transceiver.</p>
<dl class="py attribute pydantic_field">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CacheTransceiverConfig.max_num_tokens">
<em class="property"><span class="pre">field</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">max_num_tokens</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">int</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">None</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CacheTransceiverConfig.max_num_tokens" title="Link to this definition">#</a></dt>
<dd><p>The max number of tokens the transfer buffer can fit.</p>
</dd></dl>
<dl class="py attribute">
<dt class="sig sig-object py" id="tensorrt_llm.llmapi.CacheTransceiverConfig.model_config">
<span class="sig-name descname"><span class="pre">model_config</span></span><em class="property"><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="pre">ClassVar</span><span class="p"><span class="pre">[</span></span><span class="pre">ConfigDict</span><span class="p"><span class="pre">]</span></span></em><em class="property"><span class="w"> </span><span class="p"><span class="pre">=</span></span><span class="w"> </span><span class="pre">{}</span></em><a class="headerlink" href="#tensorrt_llm.llmapi.CacheTransceiverConfig.model_config" title="Link to this definition">#</a></dt>
<dd><p>Configuration for the model, should be a dictionary conforming to [<cite>ConfigDict</cite>][pydantic.config.ConfigDict].</p>
</dd></dl>
</dd></dl>
</section>
</article>
<footer class="prev-next-footer d-print-none">
<div class="prev-next-area">
<a class="left-prev"
href="index.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">API Introduction</p>
</div>
</a>
<a class="right-next"
href="../examples/index.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">LLM Examples Introduction</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div>
</footer>
</div>
<dialog id="pst-secondary-sidebar-modal"></dialog>
<div id="pst-secondary-sidebar" class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div
id="pst-page-navigation-heading-2"
class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> On this page
</div>
<nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LLM"><code class="docutils literal notranslate"><span class="pre">LLM</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LLM.tokenizer"><code class="docutils literal notranslate"><span class="pre">tokenizer</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LLM.workspace"><code class="docutils literal notranslate"><span class="pre">workspace</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LLM.__init__"><code class="docutils literal notranslate"><span class="pre">__init__()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LLM.generate"><code class="docutils literal notranslate"><span class="pre">generate()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LLM.generate_async"><code class="docutils literal notranslate"><span class="pre">generate_async()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LLM.get_kv_cache_events"><code class="docutils literal notranslate"><span class="pre">get_kv_cache_events()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LLM.get_kv_cache_events_async"><code class="docutils literal notranslate"><span class="pre">get_kv_cache_events_async()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LLM.get_stats"><code class="docutils literal notranslate"><span class="pre">get_stats()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LLM.get_stats_async"><code class="docutils literal notranslate"><span class="pre">get_stats_async()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LLM.save"><code class="docutils literal notranslate"><span class="pre">save()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LLM.shutdown"><code class="docutils literal notranslate"><span class="pre">shutdown()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id0"><code class="docutils literal notranslate"><span class="pre">tokenizer</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id1"><code class="docutils literal notranslate"><span class="pre">workspace</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CompletionOutput"><code class="docutils literal notranslate"><span class="pre">CompletionOutput</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CompletionOutput.length"><code class="docutils literal notranslate"><span class="pre">length</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CompletionOutput.token_ids_diff"><code class="docutils literal notranslate"><span class="pre">token_ids_diff</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CompletionOutput.logprobs_diff"><code class="docutils literal notranslate"><span class="pre">logprobs_diff</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CompletionOutput.text_diff"><code class="docutils literal notranslate"><span class="pre">text_diff</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CompletionOutput.__init__"><code class="docutils literal notranslate"><span class="pre">__init__()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CompletionOutput.cumulative_logprob"><code class="docutils literal notranslate"><span class="pre">cumulative_logprob</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CompletionOutput.disaggregated_params"><code class="docutils literal notranslate"><span class="pre">disaggregated_params</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CompletionOutput.finish_reason"><code class="docutils literal notranslate"><span class="pre">finish_reason</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CompletionOutput.generation_logits"><code class="docutils literal notranslate"><span class="pre">generation_logits</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CompletionOutput.index"><code class="docutils literal notranslate"><span class="pre">index</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id2"><code class="docutils literal notranslate"><span class="pre">length</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CompletionOutput.logprobs"><code class="docutils literal notranslate"><span class="pre">logprobs</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id3"><code class="docutils literal notranslate"><span class="pre">logprobs_diff</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CompletionOutput.prompt_logprobs"><code class="docutils literal notranslate"><span class="pre">prompt_logprobs</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CompletionOutput.stop_reason"><code class="docutils literal notranslate"><span class="pre">stop_reason</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CompletionOutput.text"><code class="docutils literal notranslate"><span class="pre">text</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id4"><code class="docutils literal notranslate"><span class="pre">text_diff</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CompletionOutput.token_ids"><code class="docutils literal notranslate"><span class="pre">token_ids</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id5"><code class="docutils literal notranslate"><span class="pre">token_ids_diff</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.RequestOutput"><code class="docutils literal notranslate"><span class="pre">RequestOutput</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.RequestOutput.request_id"><code class="docutils literal notranslate"><span class="pre">request_id</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.RequestOutput.prompt"><code class="docutils literal notranslate"><span class="pre">prompt</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.RequestOutput.prompt_token_ids"><code class="docutils literal notranslate"><span class="pre">prompt_token_ids</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.RequestOutput.outputs"><code class="docutils literal notranslate"><span class="pre">outputs</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.RequestOutput.context_logits"><code class="docutils literal notranslate"><span class="pre">context_logits</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.RequestOutput.finished"><code class="docutils literal notranslate"><span class="pre">finished</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.RequestOutput.__init__"><code class="docutils literal notranslate"><span class="pre">__init__()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id6"><code class="docutils literal notranslate"><span class="pre">prompt</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.GuidedDecodingParams"><code class="docutils literal notranslate"><span class="pre">GuidedDecodingParams</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.GuidedDecodingParams.__init__"><code class="docutils literal notranslate"><span class="pre">__init__()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.GuidedDecodingParams.grammar"><code class="docutils literal notranslate"><span class="pre">grammar</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.GuidedDecodingParams.json"><code class="docutils literal notranslate"><span class="pre">json</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.GuidedDecodingParams.json_object"><code class="docutils literal notranslate"><span class="pre">json_object</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.GuidedDecodingParams.regex"><code class="docutils literal notranslate"><span class="pre">regex</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.GuidedDecodingParams.structural_tag"><code class="docutils literal notranslate"><span class="pre">structural_tag</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams"><code class="docutils literal notranslate"><span class="pre">SamplingParams</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.__init__"><code class="docutils literal notranslate"><span class="pre">__init__()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.add_special_tokens"><code class="docutils literal notranslate"><span class="pre">add_special_tokens</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.additional_model_outputs"><code class="docutils literal notranslate"><span class="pre">additional_model_outputs</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.apply_batched_logits_processor"><code class="docutils literal notranslate"><span class="pre">apply_batched_logits_processor</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.bad"><code class="docutils literal notranslate"><span class="pre">bad</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.bad_token_ids"><code class="docutils literal notranslate"><span class="pre">bad_token_ids</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.beam_search_diversity_rate"><code class="docutils literal notranslate"><span class="pre">beam_search_diversity_rate</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.beam_width_array"><code class="docutils literal notranslate"><span class="pre">beam_width_array</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.best_of"><code class="docutils literal notranslate"><span class="pre">best_of</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.detokenize"><code class="docutils literal notranslate"><span class="pre">detokenize</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.early_stopping"><code class="docutils literal notranslate"><span class="pre">early_stopping</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.embedding_bias"><code class="docutils literal notranslate"><span class="pre">embedding_bias</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.end_id"><code class="docutils literal notranslate"><span class="pre">end_id</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.exclude_input_from_output"><code class="docutils literal notranslate"><span class="pre">exclude_input_from_output</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.frequency_penalty"><code class="docutils literal notranslate"><span class="pre">frequency_penalty</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.guided_decoding"><code class="docutils literal notranslate"><span class="pre">guided_decoding</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.ignore_eos"><code class="docutils literal notranslate"><span class="pre">ignore_eos</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.include_stop_str_in_output"><code class="docutils literal notranslate"><span class="pre">include_stop_str_in_output</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.length_penalty"><code class="docutils literal notranslate"><span class="pre">length_penalty</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.logits_processor"><code class="docutils literal notranslate"><span class="pre">logits_processor</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.logprobs"><code class="docutils literal notranslate"><span class="pre">logprobs</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.lookahead_config"><code class="docutils literal notranslate"><span class="pre">lookahead_config</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.max_tokens"><code class="docutils literal notranslate"><span class="pre">max_tokens</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.min_p"><code class="docutils literal notranslate"><span class="pre">min_p</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.min_tokens"><code class="docutils literal notranslate"><span class="pre">min_tokens</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.n"><code class="docutils literal notranslate"><span class="pre">n</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.no_repeat_ngram_size"><code class="docutils literal notranslate"><span class="pre">no_repeat_ngram_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.pad_id"><code class="docutils literal notranslate"><span class="pre">pad_id</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.presence_penalty"><code class="docutils literal notranslate"><span class="pre">presence_penalty</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.prompt_logprobs"><code class="docutils literal notranslate"><span class="pre">prompt_logprobs</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.repetition_penalty"><code class="docutils literal notranslate"><span class="pre">repetition_penalty</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.return_context_logits"><code class="docutils literal notranslate"><span class="pre">return_context_logits</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.return_encoder_output"><code class="docutils literal notranslate"><span class="pre">return_encoder_output</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.return_generation_logits"><code class="docutils literal notranslate"><span class="pre">return_generation_logits</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.return_perf_metrics"><code class="docutils literal notranslate"><span class="pre">return_perf_metrics</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.seed"><code class="docutils literal notranslate"><span class="pre">seed</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.skip_special_tokens"><code class="docutils literal notranslate"><span class="pre">skip_special_tokens</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.spaces_between_special_tokens"><code class="docutils literal notranslate"><span class="pre">spaces_between_special_tokens</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.stop"><code class="docutils literal notranslate"><span class="pre">stop</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.stop_token_ids"><code class="docutils literal notranslate"><span class="pre">stop_token_ids</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.temperature"><code class="docutils literal notranslate"><span class="pre">temperature</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.top_k"><code class="docutils literal notranslate"><span class="pre">top_k</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.top_p"><code class="docutils literal notranslate"><span class="pre">top_p</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.top_p_decay"><code class="docutils literal notranslate"><span class="pre">top_p_decay</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.top_p_min"><code class="docutils literal notranslate"><span class="pre">top_p_min</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.top_p_reset_ids"><code class="docutils literal notranslate"><span class="pre">top_p_reset_ids</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.truncate_prompt_tokens"><code class="docutils literal notranslate"><span class="pre">truncate_prompt_tokens</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SamplingParams.use_beam_search"><code class="docutils literal notranslate"><span class="pre">use_beam_search</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.DisaggregatedParams"><code class="docutils literal notranslate"><span class="pre">DisaggregatedParams</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.DisaggregatedParams.__init__"><code class="docutils literal notranslate"><span class="pre">__init__()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.DisaggregatedParams.ctx_request_id"><code class="docutils literal notranslate"><span class="pre">ctx_request_id</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.DisaggregatedParams.draft_tokens"><code class="docutils literal notranslate"><span class="pre">draft_tokens</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.DisaggregatedParams.first_gen_tokens"><code class="docutils literal notranslate"><span class="pre">first_gen_tokens</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.DisaggregatedParams.get_context_phase_params"><code class="docutils literal notranslate"><span class="pre">get_context_phase_params()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.DisaggregatedParams.get_request_type"><code class="docutils literal notranslate"><span class="pre">get_request_type()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.DisaggregatedParams.opaque_state"><code class="docutils literal notranslate"><span class="pre">opaque_state</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.DisaggregatedParams.request_type"><code class="docutils literal notranslate"><span class="pre">request_type</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheConfig"><code class="docutils literal notranslate"><span class="pre">KvCacheConfig</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheConfig.copy_on_partial_reuse"><code class="docutils literal notranslate"><span class="pre">copy_on_partial_reuse</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheConfig.cross_kv_cache_fraction"><code class="docutils literal notranslate"><span class="pre">cross_kv_cache_fraction</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheConfig.enable_block_reuse"><code class="docutils literal notranslate"><span class="pre">enable_block_reuse</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheConfig.enable_partial_reuse"><code class="docutils literal notranslate"><span class="pre">enable_partial_reuse</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheConfig.event_buffer_max_size"><code class="docutils literal notranslate"><span class="pre">event_buffer_max_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheConfig.free_gpu_memory_fraction"><code class="docutils literal notranslate"><span class="pre">free_gpu_memory_fraction</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheConfig.host_cache_size"><code class="docutils literal notranslate"><span class="pre">host_cache_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheConfig.max_attention_window"><code class="docutils literal notranslate"><span class="pre">max_attention_window</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheConfig.max_tokens"><code class="docutils literal notranslate"><span class="pre">max_tokens</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheConfig.model_config"><code class="docutils literal notranslate"><span class="pre">model_config</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheConfig.onboard_blocks"><code class="docutils literal notranslate"><span class="pre">onboard_blocks</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheConfig.secondary_offload_min_priority"><code class="docutils literal notranslate"><span class="pre">secondary_offload_min_priority</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheConfig.sink_token_length"><code class="docutils literal notranslate"><span class="pre">sink_token_length</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig"><code class="docutils literal notranslate"><span class="pre">KvCacheRetentionConfig</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig"><code class="docutils literal notranslate"><span class="pre">TokenRangeRetentionConfig</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.__init__"><code class="docutils literal notranslate"><span class="pre">__init__()</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.duration_ms"><code class="docutils literal notranslate"><span class="pre">duration_ms</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.priority"><code class="docutils literal notranslate"><span class="pre">priority</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.token_end"><code class="docutils literal notranslate"><span class="pre">token_end</span></code></a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig.token_start"><code class="docutils literal notranslate"><span class="pre">token_start</span></code></a></li>
</ul>
</li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.__init__"><code class="docutils literal notranslate"><span class="pre">__init__()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.decode_duration_ms"><code class="docutils literal notranslate"><span class="pre">decode_duration_ms</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.decode_retention_priority"><code class="docutils literal notranslate"><span class="pre">decode_retention_priority</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.KvCacheRetentionConfig.token_range_retention_configs"><code class="docutils literal notranslate"><span class="pre">token_range_retention_configs</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig"><code class="docutils literal notranslate"><span class="pre">LookaheadDecodingConfig</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.__init__"><code class="docutils literal notranslate"><span class="pre">__init__()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.calculate_speculative_resource"><code class="docutils literal notranslate"><span class="pre">calculate_speculative_resource()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.decoding_type"><code class="docutils literal notranslate"><span class="pre">decoding_type</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.from_dict"><code class="docutils literal notranslate"><span class="pre">from_dict()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.max_ngram_size"><code class="docutils literal notranslate"><span class="pre">max_ngram_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.max_verification_set_size"><code class="docutils literal notranslate"><span class="pre">max_verification_set_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.max_window_size"><code class="docutils literal notranslate"><span class="pre">max_window_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.model_config"><code class="docutils literal notranslate"><span class="pre">model_config</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.LookaheadDecodingConfig.validate_positive_values"><code class="docutils literal notranslate"><span class="pre">validate_positive_values()</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MedusaDecodingConfig"><code class="docutils literal notranslate"><span class="pre">MedusaDecodingConfig</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MedusaDecodingConfig.decoding_type"><code class="docutils literal notranslate"><span class="pre">decoding_type</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MedusaDecodingConfig.from_dict"><code class="docutils literal notranslate"><span class="pre">from_dict()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MedusaDecodingConfig.medusa_choices"><code class="docutils literal notranslate"><span class="pre">medusa_choices</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MedusaDecodingConfig.model_config"><code class="docutils literal notranslate"><span class="pre">model_config</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MedusaDecodingConfig.num_medusa_heads"><code class="docutils literal notranslate"><span class="pre">num_medusa_heads</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig"><code class="docutils literal notranslate"><span class="pre">EagleDecodingConfig</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig.decoding_type"><code class="docutils literal notranslate"><span class="pre">decoding_type</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig.dynamic_tree_max_topK"><code class="docutils literal notranslate"><span class="pre">dynamic_tree_max_topK</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig.eagle_choices"><code class="docutils literal notranslate"><span class="pre">eagle_choices</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig.from_dict"><code class="docutils literal notranslate"><span class="pre">from_dict()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig.greedy_sampling"><code class="docutils literal notranslate"><span class="pre">greedy_sampling</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig.max_non_leaves_per_layer"><code class="docutils literal notranslate"><span class="pre">max_non_leaves_per_layer</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig.model_config"><code class="docutils literal notranslate"><span class="pre">model_config</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig.num_eagle_layers"><code class="docutils literal notranslate"><span class="pre">num_eagle_layers</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig.posterior_threshold"><code class="docutils literal notranslate"><span class="pre">posterior_threshold</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig.pytorch_eagle_weights_path"><code class="docutils literal notranslate"><span class="pre">pytorch_eagle_weights_path</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.EagleDecodingConfig.use_dynamic_tree"><code class="docutils literal notranslate"><span class="pre">use_dynamic_tree</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MTPDecodingConfig"><code class="docutils literal notranslate"><span class="pre">MTPDecodingConfig</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MTPDecodingConfig.decoding_type"><code class="docutils literal notranslate"><span class="pre">decoding_type</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MTPDecodingConfig.from_dict"><code class="docutils literal notranslate"><span class="pre">from_dict()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MTPDecodingConfig.model_config"><code class="docutils literal notranslate"><span class="pre">model_config</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MTPDecodingConfig.num_nextn_predict_layers"><code class="docutils literal notranslate"><span class="pre">num_nextn_predict_layers</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MTPDecodingConfig.relaxed_delta"><code class="docutils literal notranslate"><span class="pre">relaxed_delta</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MTPDecodingConfig.relaxed_topk"><code class="docutils literal notranslate"><span class="pre">relaxed_topk</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MTPDecodingConfig.use_relaxed_acceptance_for_thinking"><code class="docutils literal notranslate"><span class="pre">use_relaxed_acceptance_for_thinking</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SchedulerConfig"><code class="docutils literal notranslate"><span class="pre">SchedulerConfig</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SchedulerConfig.capacity_scheduler_policy"><code class="docutils literal notranslate"><span class="pre">capacity_scheduler_policy</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SchedulerConfig.context_chunking_policy"><code class="docutils literal notranslate"><span class="pre">context_chunking_policy</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SchedulerConfig.dynamic_batch_config"><code class="docutils literal notranslate"><span class="pre">dynamic_batch_config</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.SchedulerConfig.model_config"><code class="docutils literal notranslate"><span class="pre">model_config</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy"><code class="docutils literal notranslate"><span class="pre">CapacitySchedulerPolicy</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy.GUARANTEED_NO_EVICT"><code class="docutils literal notranslate"><span class="pre">GUARANTEED_NO_EVICT</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy.MAX_UTILIZATION"><code class="docutils literal notranslate"><span class="pre">MAX_UTILIZATION</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CapacitySchedulerPolicy.STATIC_BATCH"><code class="docutils literal notranslate"><span class="pre">STATIC_BATCH</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig"><code class="docutils literal notranslate"><span class="pre">BuildConfig</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.__init__"><code class="docutils literal notranslate"><span class="pre">__init__()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.auto_parallel_config"><code class="docutils literal notranslate"><span class="pre">auto_parallel_config</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.dry_run"><code class="docutils literal notranslate"><span class="pre">dry_run</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.enable_debug_output"><code class="docutils literal notranslate"><span class="pre">enable_debug_output</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.force_num_profiles"><code class="docutils literal notranslate"><span class="pre">force_num_profiles</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.from_dict"><code class="docutils literal notranslate"><span class="pre">from_dict()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.from_json_file"><code class="docutils literal notranslate"><span class="pre">from_json_file()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.gather_context_logits"><code class="docutils literal notranslate"><span class="pre">gather_context_logits</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.gather_generation_logits"><code class="docutils literal notranslate"><span class="pre">gather_generation_logits</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.input_timing_cache"><code class="docutils literal notranslate"><span class="pre">input_timing_cache</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.kv_cache_type"><code class="docutils literal notranslate"><span class="pre">kv_cache_type</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.lora_config"><code class="docutils literal notranslate"><span class="pre">lora_config</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.max_batch_size"><code class="docutils literal notranslate"><span class="pre">max_batch_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.max_beam_width"><code class="docutils literal notranslate"><span class="pre">max_beam_width</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.max_draft_len"><code class="docutils literal notranslate"><span class="pre">max_draft_len</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.max_encoder_input_len"><code class="docutils literal notranslate"><span class="pre">max_encoder_input_len</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.max_input_len"><code class="docutils literal notranslate"><span class="pre">max_input_len</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.max_num_tokens"><code class="docutils literal notranslate"><span class="pre">max_num_tokens</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.max_prompt_embedding_table_size"><code class="docutils literal notranslate"><span class="pre">max_prompt_embedding_table_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.max_seq_len"><code class="docutils literal notranslate"><span class="pre">max_seq_len</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.monitor_memory"><code class="docutils literal notranslate"><span class="pre">monitor_memory</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.opt_batch_size"><code class="docutils literal notranslate"><span class="pre">opt_batch_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.opt_num_tokens"><code class="docutils literal notranslate"><span class="pre">opt_num_tokens</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.output_timing_cache"><code class="docutils literal notranslate"><span class="pre">output_timing_cache</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.plugin_config"><code class="docutils literal notranslate"><span class="pre">plugin_config</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.profiling_verbosity"><code class="docutils literal notranslate"><span class="pre">profiling_verbosity</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.speculative_decoding_mode"><code class="docutils literal notranslate"><span class="pre">speculative_decoding_mode</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.strongly_typed"><code class="docutils literal notranslate"><span class="pre">strongly_typed</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.to_dict"><code class="docutils literal notranslate"><span class="pre">to_dict()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.update"><code class="docutils literal notranslate"><span class="pre">update()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.update_from_dict"><code class="docutils literal notranslate"><span class="pre">update_from_dict()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.update_kv_cache_type"><code class="docutils literal notranslate"><span class="pre">update_kv_cache_type()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.use_mrope"><code class="docutils literal notranslate"><span class="pre">use_mrope</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.use_refit"><code class="docutils literal notranslate"><span class="pre">use_refit</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.use_strip_plan"><code class="docutils literal notranslate"><span class="pre">use_strip_plan</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.visualize_network"><code class="docutils literal notranslate"><span class="pre">visualize_network</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.weight_sparsity"><code class="docutils literal notranslate"><span class="pre">weight_sparsity</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildConfig.weight_streaming"><code class="docutils literal notranslate"><span class="pre">weight_streaming</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantConfig"><code class="docutils literal notranslate"><span class="pre">QuantConfig</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantConfig.__init__"><code class="docutils literal notranslate"><span class="pre">__init__()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantConfig.clamp_val"><code class="docutils literal notranslate"><span class="pre">clamp_val</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantConfig.exclude_modules"><code class="docutils literal notranslate"><span class="pre">exclude_modules</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantConfig.from_dict"><code class="docutils literal notranslate"><span class="pre">from_dict()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantConfig.group_size"><code class="docutils literal notranslate"><span class="pre">group_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantConfig.has_zero_point"><code class="docutils literal notranslate"><span class="pre">has_zero_point</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantConfig.is_module_excluded_from_quantization"><code class="docutils literal notranslate"><span class="pre">is_module_excluded_from_quantization()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantConfig.kv_cache_quant_algo"><code class="docutils literal notranslate"><span class="pre">kv_cache_quant_algo</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantConfig.layer_quant_mode"><code class="docutils literal notranslate"><span class="pre">layer_quant_mode</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantConfig.pre_quant_scale"><code class="docutils literal notranslate"><span class="pre">pre_quant_scale</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantConfig.quant_algo"><code class="docutils literal notranslate"><span class="pre">quant_algo</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantConfig.quant_mode"><code class="docutils literal notranslate"><span class="pre">quant_mode</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantConfig.smoothquant_val"><code class="docutils literal notranslate"><span class="pre">smoothquant_val</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantConfig.to_dict"><code class="docutils literal notranslate"><span class="pre">to_dict()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantConfig.use_meta_recipe"><code class="docutils literal notranslate"><span class="pre">use_meta_recipe</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo"><code class="docutils literal notranslate"><span class="pre">QuantAlgo</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.FP8"><code class="docutils literal notranslate"><span class="pre">FP8</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.FP8_BLOCK_SCALES"><code class="docutils literal notranslate"><span class="pre">FP8_BLOCK_SCALES</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN"><code class="docutils literal notranslate"><span class="pre">FP8_PER_CHANNEL_PER_TOKEN</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.INT8"><code class="docutils literal notranslate"><span class="pre">INT8</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.MIXED_PRECISION"><code class="docutils literal notranslate"><span class="pre">MIXED_PRECISION</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.NO_QUANT"><code class="docutils literal notranslate"><span class="pre">NO_QUANT</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.NVFP4"><code class="docutils literal notranslate"><span class="pre">NVFP4</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.W4A16"><code class="docutils literal notranslate"><span class="pre">W4A16</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.W4A16_AWQ"><code class="docutils literal notranslate"><span class="pre">W4A16_AWQ</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.W4A16_GPTQ"><code class="docutils literal notranslate"><span class="pre">W4A16_GPTQ</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.W4A8_AWQ"><code class="docutils literal notranslate"><span class="pre">W4A8_AWQ</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.W4A8_QSERVE_PER_CHANNEL"><code class="docutils literal notranslate"><span class="pre">W4A8_QSERVE_PER_CHANNEL</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.W4A8_QSERVE_PER_GROUP"><code class="docutils literal notranslate"><span class="pre">W4A8_QSERVE_PER_GROUP</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.W8A16"><code class="docutils literal notranslate"><span class="pre">W8A16</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.W8A16_GPTQ"><code class="docutils literal notranslate"><span class="pre">W8A16_GPTQ</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL"><code class="docutils literal notranslate"><span class="pre">W8A8_SQ_PER_CHANNEL</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN"><code class="docutils literal notranslate"><span class="pre">W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN"><code class="docutils literal notranslate"><span class="pre">W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN"><code class="docutils literal notranslate"><span class="pre">W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN"><code class="docutils literal notranslate"><span class="pre">W8A8_SQ_PER_TENSOR_PLUGIN</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CalibConfig"><code class="docutils literal notranslate"><span class="pre">CalibConfig</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CalibConfig.calib_batch_size"><code class="docutils literal notranslate"><span class="pre">calib_batch_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CalibConfig.calib_batches"><code class="docutils literal notranslate"><span class="pre">calib_batches</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CalibConfig.calib_dataset"><code class="docutils literal notranslate"><span class="pre">calib_dataset</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CalibConfig.calib_max_seq_length"><code class="docutils literal notranslate"><span class="pre">calib_max_seq_length</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CalibConfig.device"><code class="docutils literal notranslate"><span class="pre">device</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CalibConfig.from_dict"><code class="docutils literal notranslate"><span class="pre">from_dict()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CalibConfig.model_config"><code class="docutils literal notranslate"><span class="pre">model_config</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CalibConfig.random_seed"><code class="docutils literal notranslate"><span class="pre">random_seed</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CalibConfig.to_dict"><code class="docutils literal notranslate"><span class="pre">to_dict()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CalibConfig.tokenizer_max_seq_length"><code class="docutils literal notranslate"><span class="pre">tokenizer_max_seq_length</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildCacheConfig"><code class="docutils literal notranslate"><span class="pre">BuildCacheConfig</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildCacheConfig.cache_root"><code class="docutils literal notranslate"><span class="pre">cache_root</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildCacheConfig.max_records"><code class="docutils literal notranslate"><span class="pre">max_records</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildCacheConfig.max_cache_storage_gb"><code class="docutils literal notranslate"><span class="pre">max_cache_storage_gb</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BuildCacheConfig.__init__"><code class="docutils literal notranslate"><span class="pre">__init__()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id7"><code class="docutils literal notranslate"><span class="pre">cache_root</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id8"><code class="docutils literal notranslate"><span class="pre">max_cache_storage_gb</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#id9"><code class="docutils literal notranslate"><span class="pre">max_records</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.RequestError"><code class="docutils literal notranslate"><span class="pre">RequestError</span></code></a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MpiCommSession"><code class="docutils literal notranslate"><span class="pre">MpiCommSession</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MpiCommSession.__init__"><code class="docutils literal notranslate"><span class="pre">__init__()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MpiCommSession.abort"><code class="docutils literal notranslate"><span class="pre">abort()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MpiCommSession.get_comm"><code class="docutils literal notranslate"><span class="pre">get_comm()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MpiCommSession.shutdown"><code class="docutils literal notranslate"><span class="pre">shutdown()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MpiCommSession.submit"><code class="docutils literal notranslate"><span class="pre">submit()</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.MpiCommSession.submit_sync"><code class="docutils literal notranslate"><span class="pre">submit_sync()</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig"><code class="docutils literal notranslate"><span class="pre">ExtendedRuntimePerfKnobConfig</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.cuda_graph_cache_size"><code class="docutils literal notranslate"><span class="pre">cuda_graph_cache_size</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.cuda_graph_mode"><code class="docutils literal notranslate"><span class="pre">cuda_graph_mode</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.enable_context_fmha_fp32_acc"><code class="docutils literal notranslate"><span class="pre">enable_context_fmha_fp32_acc</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.model_config"><code class="docutils literal notranslate"><span class="pre">model_config</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.ExtendedRuntimePerfKnobConfig.multi_block_mode"><code class="docutils literal notranslate"><span class="pre">multi_block_mode</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BatchingType"><code class="docutils literal notranslate"><span class="pre">BatchingType</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BatchingType.INFLIGHT"><code class="docutils literal notranslate"><span class="pre">INFLIGHT</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.BatchingType.STATIC"><code class="docutils literal notranslate"><span class="pre">STATIC</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.ContextChunkingPolicy"><code class="docutils literal notranslate"><span class="pre">ContextChunkingPolicy</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.ContextChunkingPolicy.EQUAL_PROGRESS"><code class="docutils literal notranslate"><span class="pre">EQUAL_PROGRESS</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.ContextChunkingPolicy.FIRST_COME_FIRST_SERVED"><code class="docutils literal notranslate"><span class="pre">FIRST_COME_FIRST_SERVED</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.DynamicBatchConfig"><code class="docutils literal notranslate"><span class="pre">DynamicBatchConfig</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.DynamicBatchConfig.dynamic_batch_moving_average_window"><code class="docutils literal notranslate"><span class="pre">dynamic_batch_moving_average_window</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.DynamicBatchConfig.enable_batch_size_tuning"><code class="docutils literal notranslate"><span class="pre">enable_batch_size_tuning</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.DynamicBatchConfig.enable_max_num_tokens_tuning"><code class="docutils literal notranslate"><span class="pre">enable_max_num_tokens_tuning</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.DynamicBatchConfig.model_config"><code class="docutils literal notranslate"><span class="pre">model_config</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CacheTransceiverConfig"><code class="docutils literal notranslate"><span class="pre">CacheTransceiverConfig</span></code></a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CacheTransceiverConfig.max_num_tokens"><code class="docutils literal notranslate"><span class="pre">max_num_tokens</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#tensorrt_llm.llmapi.CacheTransceiverConfig.model_config"><code class="docutils literal notranslate"><span class="pre">model_config</span></code></a></li>
</ul>
</li>
</ul>
</nav></div>
</div></div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script defer src="../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf"></script>
<script defer src="../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item">
<a class="footer-brand logo" href="https://www.nvidia.com">
<img src="../_static/nvidia-logo-horiz-rgb-1c-blk-for-screen.svg" class="logo__image only-light" alt="NVIDIA"/>
<img src="../_static/nvidia-logo-horiz-rgb-1c-wht-for-screen.svg" class="logo__image only-dark" alt="NVIDIA"/>
</a></div>
<div class="footer-item">
<div class="footer-links">
<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/">Privacy Policy</a>
|
<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/">Manage My Privacy</a>
|
<a class="external" href="https://www.nvidia.com/en-us/preferences/start/">Do Not Sell or Share My Data</a>
|
<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/">Terms of Service</a>
|
<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/">Accessibility</a>
|
<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/">Corporate Policies</a>
|
<a class="external" href="https://www.nvidia.com/en-us/product-security/">Product Security</a>
|
<a class="external" href="https://www.nvidia.com/en-us/contact/">Contact</a>
</div>
</div>
<div class="footer-item">
<p class="copyright">
Copyright © 2025, NVidia.
<br/>
</p>
</div>
</div>
</div>
</footer>
</body>
</html>